Spaces:
Running
Running
kabudadada committed on
Commit ·
e76b79a
1
Parent(s): 6a001dc
Add esm folder and minimal app
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +2 -0
- Dockerfile +12 -0
- app.py +17 -0
- esm/mcp_output/README_MCP.md +144 -0
- esm/mcp_output/analysis.json +163 -0
- esm/mcp_output/env_info.json +17 -0
- esm/mcp_output/mcp_logs/llm_statistics.json +11 -0
- esm/mcp_output/mcp_logs/run_log.json +73 -0
- esm/mcp_output/mcp_plugin/__init__.py +0 -0
- esm/mcp_output/mcp_plugin/__pycache__/adapter.cpython-310.pyc +0 -0
- esm/mcp_output/mcp_plugin/__pycache__/mcp_service.cpython-310.pyc +0 -0
- esm/mcp_output/mcp_plugin/adapter.py +423 -0
- esm/mcp_output/mcp_plugin/main.py +13 -0
- esm/mcp_output/mcp_plugin/mcp_service.py +256 -0
- esm/mcp_output/predictions/prediction_20250823_235651.pdb +528 -0
- esm/mcp_output/predictions/prediction_20250830_220641.pdb +489 -0
- esm/mcp_output/requirements.txt +4 -0
- esm/mcp_output/start_mcp.py +34 -0
- esm/mcp_output/tests_mcp/test_mcp_basic.py +49 -0
- esm/mcp_output/tests_smoke/test_smoke.py +29 -0
- esm/source/.flake8 +10 -0
- esm/source/.git-blame-ignore-revs +2 -0
- esm/source/.github/ISSUE_TEMPLATE/bug.md +27 -0
- esm/source/.gitignore +31 -0
- esm/source/CODE_OF_CONDUCT.rst +6 -0
- esm/source/CONTRIBUTING.md +31 -0
- esm/source/LICENSE +21 -0
- esm/source/README.md +795 -0
- esm/source/__init__.py +4 -0
- esm/source/environment.yml +36 -0
- esm/source/esm/__init__.py +12 -0
- esm/source/esm/axial_attention.py +239 -0
- esm/source/esm/constants.py +10 -0
- esm/source/esm/data.py +493 -0
- esm/source/esm/esmfold/v1/__init__.py +0 -0
- esm/source/esm/esmfold/v1/categorical_mixture.py +43 -0
- esm/source/esm/esmfold/v1/esmfold.py +364 -0
- esm/source/esm/esmfold/v1/misc.py +309 -0
- esm/source/esm/esmfold/v1/pretrained.py +181 -0
- esm/source/esm/esmfold/v1/tri_self_attn_block.py +160 -0
- esm/source/esm/esmfold/v1/trunk.py +243 -0
- esm/source/esm/inverse_folding/__init__.py +8 -0
- esm/source/esm/inverse_folding/features.py +352 -0
- esm/source/esm/inverse_folding/gvp_encoder.py +56 -0
- esm/source/esm/inverse_folding/gvp_modules.py +475 -0
- esm/source/esm/inverse_folding/gvp_transformer.py +140 -0
- esm/source/esm/inverse_folding/gvp_transformer_encoder.py +184 -0
- esm/source/esm/inverse_folding/gvp_utils.py +68 -0
- esm/source/esm/inverse_folding/multichain_util.py +152 -0
- esm/source/esm/inverse_folding/transformer_decoder.py +228 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.p filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Serve the FastAPI app with uvicorn on the Hugging Face Spaces default port (7860).
FROM python:3.9

# Run as a non-root user with UID 1000 (required by Hugging Face Spaces);
# user-level pip installs land in ~/.local/bin, so put that on PATH.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"
WORKDIR /app

# Copy and install requirements before the rest of the source so the
# dependency layer is cached across code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
EXPOSE 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, WebSocket

# Minimal placeholder service for the Code2MCP-generated ESM Space.
app = FastAPI()


@app.get("/")
async def root():
    """Health-check endpoint: report that the service is up."""
    return {"status": "ok", "service": "Code2MCP-esm"}


@app.websocket("/ws")
async def websocket_endpoint(ws: WebSocket):
    """Accept a WebSocket connection, send a stub greeting, then close.

    NOTE(review): placeholder only — the single send/close exchange below
    is expected to be replaced by the real MCP/ESM handler.
    """
    await ws.accept()
    await ws.send_text("WebSocket is up. Replace with your MCP/ESM handler.")
    await ws.close()
|
| 16 |
+
|
| 17 |
+
|
esm/mcp_output/README_MCP.md
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ESM: Evolutionary Scale Modeling for Protein Sequences
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
`facebookresearch/esm` is an open-source project developed by Facebook AI Research (FAIR) for deep learning-based protein sequence modeling. It provides state-of-the-art tools for analyzing and predicting protein structures, functions, and variant effects using advanced language models and deep learning techniques.
|
| 6 |
+
|
| 7 |
+
### Key Features
|
| 8 |
+
|
| 9 |
+
- **Protein Language Models**: Pretrained models like ESM-1 and ESM-2 capture semantic information in protein sequences.
|
| 10 |
+
- **Multiple Sequence Alignment (MSA) Modeling**: Tools for protein modeling based on MSA, including MSA Transformer.
|
| 11 |
+
- **Inverse Folding**: Predict how protein sequences fold into 3D structures.
|
| 12 |
+
- **Variant Effect Prediction**: Assess the impact of mutations on protein functionality.
|
| 13 |
+
- **Contact Prediction**: Predict residue-residue contacts in protein sequences.
|
| 14 |
+
- **Metagenomic Analysis**: Analyze environmental protein sequences using the ESM Metagenomic Atlas.
|
| 15 |
+
- **Feature Extraction**: Tools like `esm-extract` for extracting features from pretrained models.
|
| 16 |
+
|
| 17 |
+
This repository is designed for researchers and developers in computational biology, bioinformatics, and related fields.
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## Installation
|
| 22 |
+
|
| 23 |
+
### Prerequisites
|
| 24 |
+
|
| 25 |
+
- Python 3.8 or later
|
| 26 |
+
- PyTorch 1.8 or later
|
| 27 |
+
- GPU support (optional but recommended for large-scale computations)
|
| 28 |
+
|
| 29 |
+
### Installation Steps
|
| 30 |
+
|
| 31 |
+
1. Clone the repository:
|
| 32 |
+
```
|
| 33 |
+
git clone https://github.com/facebookresearch/esm.git
|
| 34 |
+
cd esm
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
2. Install dependencies:
|
| 38 |
+
```
|
| 39 |
+
pip install -r requirements.txt
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
3. (Optional) Set up a virtual environment:
|
| 43 |
+
```
|
| 44 |
+
python -m venv esm_env
|
| 45 |
+
source esm_env/bin/activate
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
4. Install the package:
|
| 49 |
+
```
|
| 50 |
+
pip install .
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
5. (Optional) Install additional dependencies for specific features:
|
| 54 |
+
```
|
| 55 |
+
pip install fairscale pandas
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
---
|
| 59 |
+
|
| 60 |
+
## Usage
|
| 61 |
+
|
| 62 |
+
### Loading Pretrained Models
|
| 63 |
+
|
| 64 |
+
The repository provides pretrained models for various tasks. You can load a model using the following example:
|
| 65 |
+
|
| 66 |
+
```
|
| 67 |
+
from esm.pretrained import load_model_and_alphabet
|
| 68 |
+
model, alphabet = load_model_and_alphabet("esm2_t33_650M_UR50D")
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
### Command-Line Tools
|
| 72 |
+
|
| 73 |
+
The repository includes several command-line tools for common tasks:
|
| 74 |
+
|
| 75 |
+
#### 1. `esm-extract`
|
| 76 |
+
Extract features from protein sequences using pretrained models.
|
| 77 |
+
|
| 78 |
+
**Usage:**
|
| 79 |
+
```
|
| 80 |
+
esm-extract --model esm2_t33_650M_UR50D --fasta input.fasta --output output.pt
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
#### 2. `esm-fold`
|
| 84 |
+
Predict the 3D structure of a protein sequence.
|
| 85 |
+
|
| 86 |
+
**Usage:**
|
| 87 |
+
```
|
| 88 |
+
esm-fold --model esm2_t33_650M_UR50D --fasta input.fasta --output output.pdb
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
---
|
| 92 |
+
|
| 93 |
+
## Available Tools and Endpoints
|
| 94 |
+
|
| 95 |
+
### Core Modules
|
| 96 |
+
|
| 97 |
+
- **`esm.pretrained`**: Load pretrained models.
|
| 98 |
+
- Functions: `load_model_and_alphabet`, `load_model_and_alphabet_local`
|
| 99 |
+
- **`esm.data`**: Handle protein sequence data.
|
| 100 |
+
- Functions: `Alphabet`, `BatchConverter`
|
| 101 |
+
- **`esm.inverse_folding`**: Tools for inverse folding tasks.
|
| 102 |
+
- Functions: `load_inverse_folding_model`
|
| 103 |
+
- Classes: `GVPTransformerEncoder`, `GVPTransformerDecoder`
|
| 104 |
+
- **`esm.model`**: Core model definitions.
|
| 105 |
+
- Classes: `ESM1`, `ESM2`, `MSATransformer`
|
| 106 |
+
|
| 107 |
+
### CLI Commands
|
| 108 |
+
|
| 109 |
+
- **`esm-extract`**: Extract features from protein sequences.
|
| 110 |
+
- **`esm-fold`**: Predict protein 3D structures.
|
| 111 |
+
|
| 112 |
+
---
|
| 113 |
+
|
| 114 |
+
## Notes and Troubleshooting
|
| 115 |
+
|
| 116 |
+
### Notes
|
| 117 |
+
|
| 118 |
+
1. **Model Size**: Pretrained models like ESM-2 are large and may require significant memory. Use a GPU for optimal performance.
|
| 119 |
+
2. **Dependencies**: Ensure all required dependencies are installed. Optional dependencies like `fairscale` and `pandas` are needed for specific features.
|
| 120 |
+
3. **Input Formats**: Protein sequences should be provided in FASTA format for most tools.
|
| 121 |
+
|
| 122 |
+
### Troubleshooting
|
| 123 |
+
|
| 124 |
+
- **Out of Memory Errors**: If you encounter memory issues, try reducing batch size or using a smaller model.
|
| 125 |
+
- **Installation Issues**: Ensure you are using a compatible Python and PyTorch version.
|
| 126 |
+
- **Model Loading Errors**: Verify the model name and ensure the model weights are downloaded correctly.
|
| 127 |
+
|
| 128 |
+
---
|
| 129 |
+
|
| 130 |
+
## Contributing
|
| 131 |
+
|
| 132 |
+
We welcome contributions to improve the repository. Please follow the guidelines in the `CONTRIBUTING.md` file.
|
| 133 |
+
|
| 134 |
+
---
|
| 135 |
+
|
| 136 |
+
## License
|
| 137 |
+
|
| 138 |
+
This project is licensed under the MIT License. See the `LICENSE` file for details.
|
| 139 |
+
|
| 140 |
+
---
|
| 141 |
+
|
| 142 |
+
## Acknowledgments
|
| 143 |
+
|
| 144 |
+
This repository is developed and maintained by Facebook AI Research (FAIR). For more information, visit the [official repository](https://github.com/facebookresearch/esm).
|
esm/mcp_output/analysis.json
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"summary": {
|
| 3 |
+
"repository_url": "https://github.com/facebookresearch/esm",
|
| 4 |
+
"summary": "Repository: facebookresearch/esm\nCommit: main\nFiles analyzed: 100+\n\nEstimated tokens: 500k+",
|
| 5 |
+
"file_tree": "...",
|
| 6 |
+
"content": {},
|
| 7 |
+
"processed_by": "gitingest",
|
| 8 |
+
"success": true
|
| 9 |
+
},
|
| 10 |
+
"structure": {
|
| 11 |
+
"packages": [
|
| 12 |
+
"source.esm",
|
| 13 |
+
"source.scripts",
|
| 14 |
+
"source.examples"
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
"dependencies": {
|
| 18 |
+
"has_environment_yml": true,
|
| 19 |
+
"has_requirements_txt": true,
|
| 20 |
+
"pyproject": false,
|
| 21 |
+
"setup_cfg": false,
|
| 22 |
+
"setup_py": true
|
| 23 |
+
},
|
| 24 |
+
"entry_points": {
|
| 25 |
+
"imports": [],
|
| 26 |
+
"cli": [],
|
| 27 |
+
"modules": []
|
| 28 |
+
},
|
| 29 |
+
"llm_analysis": {
|
| 30 |
+
"core_modules": [
|
| 31 |
+
{
|
| 32 |
+
"package": "source.esm",
|
| 33 |
+
"module": "__init__",
|
| 34 |
+
"functions": [],
|
| 35 |
+
"classes": [],
|
| 36 |
+
"description": "Entry point for the ESM core module, may expose some core APIs."
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"package": "source.esm",
|
| 40 |
+
"module": "pretrained",
|
| 41 |
+
"functions": [
|
| 42 |
+
"load_model_and_alphabet",
|
| 43 |
+
"load_model_and_alphabet_local"
|
| 44 |
+
],
|
| 45 |
+
"classes": [],
|
| 46 |
+
"description": "Provides functionality to load pretrained models, either from local or remote sources."
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"package": "source.esm",
|
| 50 |
+
"module": "data",
|
| 51 |
+
"functions": [],
|
| 52 |
+
"classes": [
|
| 53 |
+
"Alphabet",
|
| 54 |
+
"BatchConverter"
|
| 55 |
+
],
|
| 56 |
+
"description": "Module for handling protein sequence data, including alphabet definition and batch conversion."
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"package": "source.esm",
|
| 60 |
+
"module": "inverse_folding",
|
| 61 |
+
"functions": [
|
| 62 |
+
"load_inverse_folding_model"
|
| 63 |
+
],
|
| 64 |
+
"classes": [],
|
| 65 |
+
"description": "Core module for inverse folding tasks, containing the Geometric Vector Perceptron (GVP) architecture."
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"package": "source.esm",
|
| 69 |
+
"module": "model",
|
| 70 |
+
"functions": [],
|
| 71 |
+
"classes": [
|
| 72 |
+
"ESM1",
|
| 73 |
+
"ESM2",
|
| 74 |
+
"MSATransformer"
|
| 75 |
+
],
|
| 76 |
+
"description": "Core model definition module, including ESM-1, ESM-2, and MSA Transformer."
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"package": "source.examples",
|
| 80 |
+
"module": "lm_design",
|
| 81 |
+
"functions": [
|
| 82 |
+
"generate_fixed_backbone",
|
| 83 |
+
"generate_free_backbone"
|
| 84 |
+
],
|
| 85 |
+
"classes": [],
|
| 86 |
+
"description": "Protein language model design module, supporting fixed backbone and free generation."
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"package": "source.examples",
|
| 90 |
+
"module": "variant_prediction",
|
| 91 |
+
"functions": [
|
| 92 |
+
"predict_variant_effect"
|
| 93 |
+
],
|
| 94 |
+
"classes": [],
|
| 95 |
+
"description": "Variant effect prediction module, assessing the functional impact of mutations in protein sequences."
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"package": "source.scripts",
|
| 99 |
+
"module": "extract",
|
| 100 |
+
"functions": [
|
| 101 |
+
"extract_features"
|
| 102 |
+
],
|
| 103 |
+
"classes": [],
|
| 104 |
+
"description": "Utility module for extracting features from models."
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"package": "source.scripts",
|
| 108 |
+
"module": "fold",
|
| 109 |
+
"functions": [
|
| 110 |
+
"predict_structure"
|
| 111 |
+
],
|
| 112 |
+
"classes": [],
|
| 113 |
+
"description": "Utility module for predicting protein structures."
|
| 114 |
+
}
|
| 115 |
+
],
|
| 116 |
+
"cli_commands": [
|
| 117 |
+
{
|
| 118 |
+
"command": "esm-extract",
|
| 119 |
+
"description": "Extract features for protein sequences from a pretrained model."
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"command": "esm-fold",
|
| 123 |
+
"description": "Predict protein structures using the ESM model."
|
| 124 |
+
}
|
| 125 |
+
],
|
| 126 |
+
"import_strategy": {
|
| 127 |
+
"primary": "import",
|
| 128 |
+
"fallback": "cli",
|
| 129 |
+
"confidence": 0.9
|
| 130 |
+
},
|
| 131 |
+
"dependencies": {
|
| 132 |
+
"required": [
|
| 133 |
+
"torch",
|
| 134 |
+
"fair-esm",
|
| 135 |
+
"requests",
|
| 136 |
+
"biopython"
|
| 137 |
+
],
|
| 138 |
+
"optional": []
|
| 139 |
+
},
|
| 140 |
+
"risk_assessment": {
|
| 141 |
+
"import_feasibility": 0.9,
|
| 142 |
+
"intrusiveness_risk": "low",
|
| 143 |
+
"complexity": "high"
|
| 144 |
+
}
|
| 145 |
+
},
|
| 146 |
+
"deepwiki_analysis": {
|
| 147 |
+
"repo_url": "https://github.com/facebookresearch/esm",
|
| 148 |
+
"repo_name": "esm",
|
| 149 |
+
"analysis": "### Analysis Report: GitHub Repository `facebookresearch/esm`\n\n#### 1. What are the main functions and purposes of this repository?\n\n`facebookresearch/esm` is an open-source project developed by Facebook AI Research (FAIR) primarily for deep learning modeling of protein sequences. Its core objective is to analyze and predict protein structure, function, and variant effects using Language Models (LMs) and deep learning techniques. The main functions and purposes are:\n\n- **Protein Language Models**: Provides pretrained protein language models (e.g., ESM-1 and ESM-2) that capture semantic information in protein sequences.\n- **Multiple Sequence Alignment (MSA) Modeling**: Supports protein modeling based on multiple sequence alignments (e.g., MSA Transformer).\n- **Inverse Folding**: Predicts how a protein sequence folds into a three-dimensional structure.\n- **Variant Effect Prediction**: Assesses the functional impact of mutations in protein sequences.\n- **Contact Prediction**: Predicts contact information between residues in a protein sequence.\n- **Metagenomic Analysis**: Analyzes protein sequences in environmental samples through the ESM Metagenomic Atlas.\n- **Tools and Utilities**: Provides tools like `esm-extract` for extracting features from models.\n\n#### 2. 
What are the core modules and entry points of this repository?\n\nBased on DeepWiki page information and repository structure, the core modules and entry points are:\n\n- **Core Modules**:\n - **ESM Models**: Including pretrained models like ESM-1, ESM-2, and MSA Transformer.\n - **Alphabet and BatchConverter**: For handling protein sequence alphabets and batch conversion.\n - **esm-extract**: A utility module for extracting features from models.\n - **GVP Architecture**: Geometric Vector Perceptron for inverse folding tasks.\n - **ESM Metagenomic Atlas**: A submodule for metagenomic analysis.\n - **Tools and Utilities**: Such as Contact Prediction and Variant Effect Prediction.\n\n- **Main Entry Points**:\n - **Pretrained Models**: `esm.pretrained.load_model_and_alphabet()`\n - **Scripts**: `scripts/extract.py`, `scripts/fold.py`\n - **Examples**: `examples/variant_prediction/predict.py`\n\n#### 3. What are the main technology stacks and dependencies used by this repository?\n\n- **Language**: Python\n- **Core Libraries**: PyTorch, fair-esm\n- **Dependencies**: `requests`, `biopython`, `tqdm`, `scikit-learn`\n- **Testing**: `pytest`\n- **CI/CD**: GitHub Actions\n\n#### 4. Is this project suitable for conversion to an MCP (Model Context Protocol) service? Why?\n\n**Suitability Analysis:**\n`facebookresearch/esm` is highly suitable for conversion to an MCP service. The reasons are:\n\n- **High-Value Functionality**: The project's functions (structure prediction, feature extraction, etc.) 
are of high value and widely applicable.\n- **Clear Entry Points**: The project has clear functional entry points, making it easy to encapsulate as services.\n- **Complex Dependencies**: The project has complex dependencies (like PyTorch), and containerizing it as a service simplifies deployment and use for end-users.\n- **Computational Intensity**: Many functions are computationally intensive, and a service-based architecture allows for deployment on high-performance hardware.\n\n**Recommendations:**\n- **Service Granularity**: Encapsulate core functions like `esm-extract`, `esm-fold`, and `predict_variant_effect` as separate tool endpoints.\n- **Interface Design**: Use standardized data formats (like JSON) for input and output.\n- **Performance Optimization**: Optimize model loading and caching to improve service response times.\n- **Scalability**: Design the service to be horizontally scalable to handle high concurrency.",
|
| 150 |
+
"model": "gpt-4o",
|
| 151 |
+
"source": "llm_direct_analysis",
|
| 152 |
+
"success": true
|
| 153 |
+
},
|
| 154 |
+
"deepwiki_options": {
|
| 155 |
+
"enabled": true,
|
| 156 |
+
"model": "gpt-4o"
|
| 157 |
+
},
|
| 158 |
+
"risk": {
|
| 159 |
+
"import_feasibility": 0.9,
|
| 160 |
+
"intrusiveness_risk": "low",
|
| 161 |
+
"complexity": "high"
|
| 162 |
+
}
|
| 163 |
+
}
|
esm/mcp_output/env_info.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"environment": {
|
| 3 |
+
"type": "conda",
|
| 4 |
+
"name": "esm_774629_env",
|
| 5 |
+
"files": {
|
| 6 |
+
"pyproject_toml": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\pyproject.toml"
|
| 7 |
+
},
|
| 8 |
+
"python": "3.10",
|
| 9 |
+
"exec_prefix": []
|
| 10 |
+
},
|
| 11 |
+
"original_tests": {
|
| 12 |
+
"passed": true,
|
| 13 |
+
"report_path": null
|
| 14 |
+
},
|
| 15 |
+
"timestamp": 1755775471.7781281,
|
| 16 |
+
"conda_available": true
|
| 17 |
+
}
|
esm/mcp_output/mcp_logs/llm_statistics.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"total_calls": 4,
|
| 3 |
+
"failed_calls": 0,
|
| 4 |
+
"retry_count": 0,
|
| 5 |
+
"total_prompt_tokens": 52280,
|
| 6 |
+
"total_completion_tokens": 5432,
|
| 7 |
+
"total_tokens": 57712,
|
| 8 |
+
"average_prompt_tokens": 13070.0,
|
| 9 |
+
"average_completion_tokens": 1358.0,
|
| 10 |
+
"average_tokens": 14428.0
|
| 11 |
+
}
|
esm/mcp_output/mcp_logs/run_log.json
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"timestamp": 1755775629.137685,
|
| 3 |
+
"node": "RunNode",
|
| 4 |
+
"test_result": {
|
| 5 |
+
"passed": false,
|
| 6 |
+
"report_path": null,
|
| 7 |
+
"stdout": "",
|
| 8 |
+
"stderr": "repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\mcp_service.py\", line 8, in <module>\n\n from esm import pretrained, data, inverse_folding, model\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\__init__.py\", line 6, in <module>\n\n from . import gvp_transformer\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\gvp_transformer.py\", line 16, in <module>\n\n from .features import DihedralFeatures\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\features.py\", line 73, in <module>\n\n from .gvp_modules import GVP, LayerNorm\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\gvp_modules.py\", line 33, in <module>\n\n from torch_geometric.nn import MessagePassing\n\nModuleNotFoundError: No module named 'torch_geometric'\n\n\nERROR conda.cli.main_run:execute(49): `conda run python mcp_output\\start_mcp.py` failed. (See above for error)\n"
|
| 9 |
+
},
|
| 10 |
+
"run_result": {
|
| 11 |
+
"success": false,
|
| 12 |
+
"test_passed": false,
|
| 13 |
+
"exit_code": 1,
|
| 14 |
+
"stdout": "",
|
| 15 |
+
"stderr": "repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\mcp_service.py\", line 8, in <module>\n\n from esm import pretrained, data, inverse_folding, model\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\__init__.py\", line 6, in <module>\n\n from . import gvp_transformer\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\gvp_transformer.py\", line 16, in <module>\n\n from .features import DihedralFeatures\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\features.py\", line 73, in <module>\n\n from .gvp_modules import GVP, LayerNorm\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\gvp_modules.py\", line 33, in <module>\n\n from torch_geometric.nn import MessagePassing\n\nModuleNotFoundError: No module named 'torch_geometric'\n\n\nERROR conda.cli.main_run:execute(49): `conda run python mcp_output\\start_mcp.py` failed. (See above for error)\n",
|
| 16 |
+
"timestamp": 1755775629.137685,
|
| 17 |
+
"details": {
|
| 18 |
+
"command": "D:\\download\\Anaconda\\Scripts\\conda.exe run -n esm_774629_env --cwd E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm python mcp_output\\start_mcp.py",
|
| 19 |
+
"working_directory": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm",
|
| 20 |
+
"environment_type": "conda"
|
| 21 |
+
}
|
| 22 |
+
},
|
| 23 |
+
"environment": {
|
| 24 |
+
"type": "conda",
|
| 25 |
+
"name": "esm_774629_env",
|
| 26 |
+
"files": {
|
| 27 |
+
"pyproject_toml": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\pyproject.toml"
|
| 28 |
+
},
|
| 29 |
+
"python": "3.10",
|
| 30 |
+
"exec_prefix": []
|
| 31 |
+
},
|
| 32 |
+
"plugin_info": {
|
| 33 |
+
"files": {
|
| 34 |
+
"mcp_output/start_mcp.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\start_mcp.py",
|
| 35 |
+
"mcp_output/mcp_plugin/__init__.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\__init__.py",
|
| 36 |
+
"mcp_output/mcp_plugin/mcp_service.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\mcp_service.py",
|
| 37 |
+
"mcp_output/mcp_plugin/adapter.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\adapter.py",
|
| 38 |
+
"mcp_output/mcp_plugin/main.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\main.py",
|
| 39 |
+
"mcp_output/requirements.txt": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\requirements.txt",
|
| 40 |
+
"mcp_output/README_MCP.md": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\README_MCP.md",
|
| 41 |
+
"mcp_output/tests_mcp/test_mcp_basic.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\tests_mcp\\test_mcp_basic.py"
|
| 42 |
+
},
|
| 43 |
+
"adapter_mode": "import",
|
| 44 |
+
"endpoints": [
|
| 45 |
+
"health",
|
| 46 |
+
"version",
|
| 47 |
+
"load_model_and_alphabet*",
|
| 48 |
+
"load_model_and_alphabet_local*",
|
| 49 |
+
"Alphabet",
|
| 50 |
+
"BatchConverter",
|
| 51 |
+
"load_inverse_folding_model*",
|
| 52 |
+
"gvptransformerencoder*",
|
| 53 |
+
"gvptransformerdecoder*",
|
| 54 |
+
"esm1*",
|
| 55 |
+
"esm2",
|
| 56 |
+
"msatransformer",
|
| 57 |
+
"generate_fixed_backbone*",
|
| 58 |
+
"generate_free_backbone*",
|
| 59 |
+
"predict_variant_effect*",
|
| 60 |
+
"extract_features*",
|
| 61 |
+
"predict_structure*"
|
| 62 |
+
],
|
| 63 |
+
"mcp_dir": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\mcp_plugin",
|
| 64 |
+
"tests_dir": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\tests_mcp",
|
| 65 |
+
"main_entry": "start_mcp.py",
|
| 66 |
+
"readme_path": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\README_MCP.md",
|
| 67 |
+
"requirements": [
|
| 68 |
+
"fastmcp>=0.1.0",
|
| 69 |
+
"pydantic>=2.0.0"
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
"fastmcp_installed": false
|
| 73 |
+
}
|
esm/mcp_output/mcp_plugin/__init__.py
ADDED
|
File without changes
|
esm/mcp_output/mcp_plugin/__pycache__/adapter.cpython-310.pyc
ADDED
|
Binary file (6.54 kB). View file
|
|
|
esm/mcp_output/mcp_plugin/__pycache__/mcp_service.cpython-310.pyc
ADDED
|
Binary file (6.65 kB). View file
|
|
|
esm/mcp_output/mcp_plugin/adapter.py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import sys

# Prepend the "source" checkout to sys.path so that `esm`, `examples`, and
# `scripts` resolve as top-level packages.
# NOTE(review): this resolves to <two-levels-up-from-this-file>/source —
# verify that matches the actual location of the source tree.
source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "source")
sys.path.insert(0, source_path)

# Import the wrapped ESM entry points. If any import fails (e.g. a missing
# optional dependency such as torch_geometric), the failure is only printed
# and the corresponding names stay undefined — later use then raises
# NameError, which the Adapter methods catch via their broad
# `except Exception` handlers.
try:
    from esm.pretrained import load_model_and_alphabet, load_model_and_alphabet_local
    from esm.data import Alphabet, BatchConverter
    from esm.inverse_folding import load_inverse_folding_model
    from esm.model import ESM1, ESM2, MSATransformer
    from examples.lm_design.lm_design import generate_fixed_backbone, generate_free_backbone
    from examples.variant_prediction.predict import predict_variant_effect
    from scripts.extract import extract_features
    from scripts.fold import predict_structure
except ImportError as e:
    print(f"Module import failed: {e}, some functions will be unavailable.")
|
| 20 |
+
|
| 21 |
+
class Adapter:
|
| 22 |
+
"""
|
| 23 |
+
MCP Import mode adapter class for encapsulating core functionality of facebookresearch/esm repository.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
def __init__(self):
|
| 27 |
+
"""
|
| 28 |
+
Initialize adapter class.
|
| 29 |
+
"""
|
| 30 |
+
self.mode = "import"
|
| 31 |
+
self.models = {}
|
| 32 |
+
|
| 33 |
+
# ------------------------- Model Loading Module -------------------------
|
| 34 |
+
|
| 35 |
+
def load_pretrained_model(self, model_name, local_path=None):
|
| 36 |
+
"""
|
| 37 |
+
Load pre-trained model.
|
| 38 |
+
|
| 39 |
+
Parameters:
|
| 40 |
+
- model_name: str, model name.
|
| 41 |
+
- local_path: str, optional, local model path.
|
| 42 |
+
|
| 43 |
+
Returns:
|
| 44 |
+
- dict: Information containing status and model instance.
|
| 45 |
+
"""
|
| 46 |
+
try:
|
| 47 |
+
if local_path:
|
| 48 |
+
model, alphabet = load_model_and_alphabet_local(local_path)
|
| 49 |
+
else:
|
| 50 |
+
model, alphabet = load_model_and_alphabet(model_name)
|
| 51 |
+
self.models[model_name] = model
|
| 52 |
+
return {"status": "success", "model": model, "alphabet": alphabet}
|
| 53 |
+
except Exception as e:
|
| 54 |
+
return {"status": "error", "message": f"Failed to load model: {e}"}
|
| 55 |
+
|
| 56 |
+
def load_inverse_folding_model(self, model_name):
|
| 57 |
+
"""
|
| 58 |
+
Load inverse folding model.
|
| 59 |
+
|
| 60 |
+
Parameters:
|
| 61 |
+
- model_name: str, model name.
|
| 62 |
+
|
| 63 |
+
Returns:
|
| 64 |
+
- dict: Information containing status and model instance.
|
| 65 |
+
"""
|
| 66 |
+
try:
|
| 67 |
+
model = load_inverse_folding_model(model_name)
|
| 68 |
+
self.models[model_name] = model
|
| 69 |
+
return {"status": "success", "model": model}
|
| 70 |
+
except Exception as e:
|
| 71 |
+
return {"status": "error", "message": f"Failed to load inverse folding model: {e}"}
|
| 72 |
+
|
| 73 |
+
# ------------------------- Data Processing Module -------------------------
|
| 74 |
+
|
| 75 |
+
def create_alphabet(self):
|
| 76 |
+
"""
|
| 77 |
+
Create alphabet for protein sequences.
|
| 78 |
+
|
| 79 |
+
Returns:
|
| 80 |
+
- dict: Information containing status and Alphabet instance.
|
| 81 |
+
"""
|
| 82 |
+
try:
|
| 83 |
+
alphabet = Alphabet()
|
| 84 |
+
return {"status": "success", "alphabet": alphabet}
|
| 85 |
+
except Exception as e:
|
| 86 |
+
return {"status": "error", "message": f"Failed to create alphabet: {e}"}
|
| 87 |
+
|
| 88 |
+
def create_batch_converter(self, alphabet):
|
| 89 |
+
"""
|
| 90 |
+
Create batch converter.
|
| 91 |
+
|
| 92 |
+
Parameters:
|
| 93 |
+
- alphabet: Alphabet instance.
|
| 94 |
+
|
| 95 |
+
Returns:
|
| 96 |
+
- dict: Information containing status and BatchConverter instance.
|
| 97 |
+
"""
|
| 98 |
+
try:
|
| 99 |
+
batch_converter = BatchConverter(alphabet)
|
| 100 |
+
return {"status": "success", "batch_converter": batch_converter}
|
| 101 |
+
except Exception as e:
|
| 102 |
+
return {"status": "error", "message": f"Failed to create batch converter: {e}"}
|
| 103 |
+
|
| 104 |
+
# ------------------------- Model Instantiation Module -------------------------
|
| 105 |
+
|
| 106 |
+
def create_esm1_model(self, num_layers=12, embed_dim=768, attention_heads=12, alphabet_size=33):
|
| 107 |
+
"""
|
| 108 |
+
Instantiate ESM1 model.
|
| 109 |
+
|
| 110 |
+
Parameters:
|
| 111 |
+
- num_layers: int, number of transformer layers (default: 12)
|
| 112 |
+
- embed_dim: int, embedding dimension (default: 768)
|
| 113 |
+
- attention_heads: int, number of attention heads (default: 12)
|
| 114 |
+
- alphabet_size: int, size of the alphabet (default: 33)
|
| 115 |
+
|
| 116 |
+
Returns:
|
| 117 |
+
- dict: Information containing status and ESM1 instance.
|
| 118 |
+
"""
|
| 119 |
+
try:
|
| 120 |
+
model = ESM1(
|
| 121 |
+
num_layers=num_layers,
|
| 122 |
+
embed_dim=embed_dim,
|
| 123 |
+
attention_heads=attention_heads,
|
| 124 |
+
alphabet_size=alphabet_size
|
| 125 |
+
)
|
| 126 |
+
return {"status": "success", "model": model}
|
| 127 |
+
except Exception as e:
|
| 128 |
+
return {"status": "error", "message": f"Failed to instantiate ESM1 model: {e}"}
|
| 129 |
+
|
| 130 |
+
def create_esm2_model(self, num_layers=33, embed_dim=1280, attention_heads=20, alphabet_size=33):
|
| 131 |
+
"""
|
| 132 |
+
Instantiate ESM2 model.
|
| 133 |
+
|
| 134 |
+
Parameters:
|
| 135 |
+
- num_layers: int, number of transformer layers (default: 33)
|
| 136 |
+
- embed_dim: int, embedding dimension (default: 1280)
|
| 137 |
+
- attention_heads: int, number of attention heads (default: 20)
|
| 138 |
+
- alphabet_size: int, size of the alphabet (default: 33)
|
| 139 |
+
|
| 140 |
+
Returns:
|
| 141 |
+
- dict: Information containing status and ESM2 instance.
|
| 142 |
+
"""
|
| 143 |
+
try:
|
| 144 |
+
model = ESM2(
|
| 145 |
+
num_layers=num_layers,
|
| 146 |
+
embed_dim=embed_dim,
|
| 147 |
+
attention_heads=attention_heads,
|
| 148 |
+
alphabet_size=alphabet_size
|
| 149 |
+
)
|
| 150 |
+
return {"status": "success", "model": model}
|
| 151 |
+
except Exception as e:
|
| 152 |
+
return {"status": "error", "message": f"Failed to instantiate ESM2 model: {e}"}
|
| 153 |
+
|
| 154 |
+
def create_msa_transformer(self, num_layers=12, embed_dim=768, attention_heads=12, max_tokens_per_msa=2**14):
|
| 155 |
+
"""
|
| 156 |
+
Instantiate MSA Transformer model.
|
| 157 |
+
|
| 158 |
+
Parameters:
|
| 159 |
+
- num_layers: int, number of transformer layers (default: 12)
|
| 160 |
+
- embed_dim: int, embedding dimension (default: 768)
|
| 161 |
+
- attention_heads: int, number of attention heads (default: 12)
|
| 162 |
+
- max_tokens_per_msa: int, maximum tokens per MSA (default: 2**14)
|
| 163 |
+
|
| 164 |
+
Returns:
|
| 165 |
+
- dict: Information containing status and MSATransformer instance.
|
| 166 |
+
"""
|
| 167 |
+
try:
|
| 168 |
+
model = MSATransformer(
|
| 169 |
+
num_layers=num_layers,
|
| 170 |
+
embed_dim=embed_dim,
|
| 171 |
+
attention_heads=attention_heads,
|
| 172 |
+
max_tokens_per_msa=max_tokens_per_msa
|
| 173 |
+
)
|
| 174 |
+
return {"status": "success", "model": model}
|
| 175 |
+
except Exception as e:
|
| 176 |
+
return {"status": "error", "message": f"Failed to instantiate MSA Transformer model: {e}"}
|
| 177 |
+
|
| 178 |
+
# ------------------------- Function Call Module -------------------------
|
| 179 |
+
|
| 180 |
+
def generate_fixed_backbone(self, model, alphabet, pdb_file, chain_id, temperature=1.0, num_samples=1):
|
| 181 |
+
"""
|
| 182 |
+
Call fixed backbone generation function.
|
| 183 |
+
|
| 184 |
+
Parameters:
|
| 185 |
+
- model: ESM model instance
|
| 186 |
+
- alphabet: Alphabet instance
|
| 187 |
+
- pdb_file: str, path to PDB file
|
| 188 |
+
- chain_id: str, chain identifier
|
| 189 |
+
- temperature: float, sampling temperature (default: 1.0)
|
| 190 |
+
- num_samples: int, number of samples to generate (default: 1)
|
| 191 |
+
|
| 192 |
+
Returns:
|
| 193 |
+
- dict: Information containing status and generation result.
|
| 194 |
+
"""
|
| 195 |
+
try:
|
| 196 |
+
result = generate_fixed_backbone(
|
| 197 |
+
model=model,
|
| 198 |
+
alphabet=alphabet,
|
| 199 |
+
pdb_file=pdb_file,
|
| 200 |
+
chain_id=chain_id,
|
| 201 |
+
temperature=temperature,
|
| 202 |
+
num_samples=num_samples
|
| 203 |
+
)
|
| 204 |
+
return {"status": "success", "result": result}
|
| 205 |
+
except Exception as e:
|
| 206 |
+
return {"status": "error", "message": f"Failed to generate fixed backbone: {e}"}
|
| 207 |
+
|
| 208 |
+
def generate_free_backbone(self, model, alphabet, length, temperature=1.0, num_samples=1, device="cpu"):
|
| 209 |
+
"""
|
| 210 |
+
Call free backbone generation function.
|
| 211 |
+
|
| 212 |
+
Parameters:
|
| 213 |
+
- model: ESM model instance
|
| 214 |
+
- alphabet: Alphabet instance
|
| 215 |
+
- length: int, desired sequence length
|
| 216 |
+
- temperature: float, sampling temperature (default: 1.0)
|
| 217 |
+
- num_samples: int, number of samples to generate (default: 1)
|
| 218 |
+
- device: str, device to use for computation (default: "cpu")
|
| 219 |
+
|
| 220 |
+
Returns:
|
| 221 |
+
- dict: Information containing status and generation result.
|
| 222 |
+
"""
|
| 223 |
+
try:
|
| 224 |
+
result = generate_free_backbone(
|
| 225 |
+
model=model,
|
| 226 |
+
alphabet=alphabet,
|
| 227 |
+
length=length,
|
| 228 |
+
temperature=temperature,
|
| 229 |
+
num_samples=num_samples,
|
| 230 |
+
device=device
|
| 231 |
+
)
|
| 232 |
+
return {"status": "success", "result": result}
|
| 233 |
+
except Exception as e:
|
| 234 |
+
return {"status": "error", "message": f"Failed to generate free backbone: {e}"}
|
| 235 |
+
|
| 236 |
+
def predict_variant_effect(self, model, alphabet, sequence, mutations, batch_size=1, device="cpu"):
|
| 237 |
+
"""
|
| 238 |
+
Call variant effect prediction function.
|
| 239 |
+
|
| 240 |
+
Parameters:
|
| 241 |
+
- model: ESM model instance
|
| 242 |
+
- alphabet: Alphabet instance
|
| 243 |
+
- sequence: str, wild-type protein sequence
|
| 244 |
+
- mutations: list, list of mutations in format ["A123V", "G456D"]
|
| 245 |
+
- batch_size: int, batch size for processing (default: 1)
|
| 246 |
+
- device: str, device to use for computation (default: "cpu")
|
| 247 |
+
|
| 248 |
+
Returns:
|
| 249 |
+
- dict: Information containing status and prediction result.
|
| 250 |
+
"""
|
| 251 |
+
try:
|
| 252 |
+
result = predict_variant_effect(
|
| 253 |
+
model=model,
|
| 254 |
+
alphabet=alphabet,
|
| 255 |
+
sequence=sequence,
|
| 256 |
+
mutations=mutations,
|
| 257 |
+
batch_size=batch_size,
|
| 258 |
+
device=device
|
| 259 |
+
)
|
| 260 |
+
return {"status": "success", "result": result}
|
| 261 |
+
except Exception as e:
|
| 262 |
+
return {"status": "error", "message": f"Failed to predict variant effect: {e}"}
|
| 263 |
+
|
| 264 |
+
def extract_features(self, model, alphabet, sequences, repr_layers=[-1], include_contacts=False, device="cpu"):
|
| 265 |
+
"""
|
| 266 |
+
Call feature extraction function.
|
| 267 |
+
|
| 268 |
+
Parameters:
|
| 269 |
+
- model: ESM model instance
|
| 270 |
+
- alphabet: Alphabet instance
|
| 271 |
+
- sequences: list, list of protein sequences
|
| 272 |
+
- repr_layers: list, layers to extract representations from (default: [-1])
|
| 273 |
+
- include_contacts: bool, whether to include contact predictions (default: False)
|
| 274 |
+
- device: str, device to use for computation (default: "cpu")
|
| 275 |
+
|
| 276 |
+
Returns:
|
| 277 |
+
- dict: Information containing status and extraction result.
|
| 278 |
+
"""
|
| 279 |
+
try:
|
| 280 |
+
result = extract_features(
|
| 281 |
+
model=model,
|
| 282 |
+
alphabet=alphabet,
|
| 283 |
+
sequences=sequences,
|
| 284 |
+
repr_layers=repr_layers,
|
| 285 |
+
include_contacts=include_contacts,
|
| 286 |
+
device=device
|
| 287 |
+
)
|
| 288 |
+
return {"status": "success", "result": result}
|
| 289 |
+
except Exception as e:
|
| 290 |
+
return {"status": "error", "message": f"Failed to extract features: {e}"}
|
| 291 |
+
|
| 292 |
+
def predict_structure_local(self, model, alphabet, sequence, device="cpu"):
|
| 293 |
+
"""
|
| 294 |
+
Call local structure prediction function.
|
| 295 |
+
|
| 296 |
+
Parameters:
|
| 297 |
+
- model: ESM model instance
|
| 298 |
+
- alphabet: Alphabet instance
|
| 299 |
+
- sequence: str, protein sequence
|
| 300 |
+
- device: str, device to use for computation (default: "cpu")
|
| 301 |
+
|
| 302 |
+
Returns:
|
| 303 |
+
- dict: Information containing status and prediction result.
|
| 304 |
+
"""
|
| 305 |
+
try:
|
| 306 |
+
result = predict_structure(
|
| 307 |
+
model=model,
|
| 308 |
+
alphabet=alphabet,
|
| 309 |
+
sequence=sequence,
|
| 310 |
+
device=device
|
| 311 |
+
)
|
| 312 |
+
return {"status": "success", "result": result}
|
| 313 |
+
except Exception as e:
|
| 314 |
+
return {"status": "error", "message": f"Failed to predict structure: {e}"}
|
| 315 |
+
|
| 316 |
+
def predict_structure(self, sequence):
|
| 317 |
+
"""
|
| 318 |
+
Predict protein structure using ESMFold API.
|
| 319 |
+
|
| 320 |
+
Parameters:
|
| 321 |
+
- sequence: str, protein amino acid sequence.
|
| 322 |
+
|
| 323 |
+
Returns:
|
| 324 |
+
- dict: Information containing status and prediction result.
|
| 325 |
+
"""
|
| 326 |
+
try:
|
| 327 |
+
import requests
|
| 328 |
+
from Bio.PDB import PDBParser
|
| 329 |
+
import io
|
| 330 |
+
|
| 331 |
+
response = requests.post(
|
| 332 |
+
"https://api.esmatlas.com/foldSequence/v1/pdb/",
|
| 333 |
+
data=sequence,
|
| 334 |
+
timeout=300
|
| 335 |
+
)
|
| 336 |
+
|
| 337 |
+
if response.status_code == 200 and response.text.strip():
|
| 338 |
+
parser = PDBParser(QUIET=True)
|
| 339 |
+
pdb_io = io.StringIO(response.text)
|
| 340 |
+
structure = parser.get_structure("esmfold_prediction", pdb_io)
|
| 341 |
+
|
| 342 |
+
structure_info = {
|
| 343 |
+
"num_models": len(structure),
|
| 344 |
+
"num_chains": len(list(structure.get_chains())),
|
| 345 |
+
"num_residues": len(list(structure.get_residues())),
|
| 346 |
+
"num_atoms": len(list(structure.get_atoms())),
|
| 347 |
+
"pdb_content": response.text
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
return {"status": "success", "result": structure_info}
|
| 351 |
+
else:
|
| 352 |
+
return {"status": "error", "message": f"API returned error: {response.status_code}"}
|
| 353 |
+
|
| 354 |
+
except requests.exceptions.Timeout:
|
| 355 |
+
return {"status": "error", "message": "ESMFold API request timed out"}
|
| 356 |
+
except Exception as e:
|
| 357 |
+
return {"status": "error", "message": f"Error predicting structure: {e}"}
|
| 358 |
+
|
| 359 |
+
def analyze_protein_sequence(self, sequence):
|
| 360 |
+
"""
|
| 361 |
+
Analyze basic features of a protein sequence.
|
| 362 |
+
|
| 363 |
+
Parameters:
|
| 364 |
+
- sequence: str, protein sequence.
|
| 365 |
+
|
| 366 |
+
Returns:
|
| 367 |
+
- dict: Information containing status and analysis result.
|
| 368 |
+
"""
|
| 369 |
+
try:
|
| 370 |
+
length = len(sequence)
|
| 371 |
+
amino_acids = set(sequence)
|
| 372 |
+
|
| 373 |
+
composition = {}
|
| 374 |
+
for aa in amino_acids:
|
| 375 |
+
composition[aa] = sequence.count(aa)
|
| 376 |
+
|
| 377 |
+
result = {
|
| 378 |
+
"length": length,
|
| 379 |
+
"unique_amino_acids": len(amino_acids),
|
| 380 |
+
"composition": composition,
|
| 381 |
+
"sequence": sequence
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
return {"status": "success", "result": result}
|
| 385 |
+
except Exception as e:
|
| 386 |
+
return {"status": "error", "message": f"Failed to analyze sequence: {e}"}
|
| 387 |
+
|
| 388 |
+
def validate_protein_sequence(self, sequence):
|
| 389 |
+
"""
|
| 390 |
+
Validate protein sequence format.
|
| 391 |
+
|
| 392 |
+
Parameters:
|
| 393 |
+
- sequence: str, protein sequence.
|
| 394 |
+
|
| 395 |
+
Returns:
|
| 396 |
+
- dict: Information containing status and validation result.
|
| 397 |
+
"""
|
| 398 |
+
try:
|
| 399 |
+
valid_amino_acids = set("ACDEFGHIKLMNPQRSTVWY")
|
| 400 |
+
sequence_upper = sequence.upper()
|
| 401 |
+
|
| 402 |
+
invalid_chars = set(sequence_upper) - valid_amino_acids
|
| 403 |
+
|
| 404 |
+
is_valid = len(invalid_chars) == 0
|
| 405 |
+
|
| 406 |
+
result = {
|
| 407 |
+
"is_valid": is_valid,
|
| 408 |
+
"invalid_characters": list(invalid_chars) if invalid_chars else [],
|
| 409 |
+
"length": len(sequence),
|
| 410 |
+
"uppercase_sequence": sequence_upper
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
return {"status": "success", "result": result}
|
| 414 |
+
except Exception as e:
|
| 415 |
+
return {"status": "error", "message": f"Failed to validate sequence: {e}"}
|
| 416 |
+
|
| 417 |
+
# ------------------------- Fallback Mode Handling -------------------------
|
| 418 |
+
|
| 419 |
+
def fallback_mode(self):
|
| 420 |
+
"""
|
| 421 |
+
Enable fallback mode, prompting the user that some functions are unavailable.
|
| 422 |
+
"""
|
| 423 |
+
return {"status": "warning", "message": "Some functions are unavailable, please check module import status."}
|
esm/mcp_output/mcp_plugin/main.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MCP Service Auto-Wrapper - Auto-generated
|
| 3 |
+
"""
|
| 4 |
+
from mcp_service import create_app
|
| 5 |
+
|
| 6 |
+
def main():
    """Build and return the FastMCP application instance."""
    return create_app()
|
| 10 |
+
|
| 11 |
+
if __name__ == "__main__":
    # Run the MCP service when executed as a script.
    main().run()
|
esm/mcp_output/mcp_plugin/mcp_service.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "source")
|
| 5 |
+
sys.path.insert(0, source_path)
|
| 6 |
+
|
| 7 |
+
from fastmcp import FastMCP
|
| 8 |
+
from esm import pretrained, data, inverse_folding, model
|
| 9 |
+
# from examples.lm_design.lm_design import lm_design
|
| 10 |
+
# from examples.variant_prediction.predict import predict
|
| 11 |
+
# from scripts import extract, fold
|
| 12 |
+
|
| 13 |
+
mcp = FastMCP("esm_service")
|
| 14 |
+
|
| 15 |
+
@mcp.tool(name="load_pretrained_model", description="Load a pretrained ESM model")
def load_pretrained_model(model_name: str):
    """
    Load a pretrained ESM model and its alphabet.

    Parameters:
        model_name (str): Model name, e.g., 'esm1b_t33_650M_UR50S'.

    Returns:
        dict: Contains success/result/error fields.
    """
    try:
        model, alphabet = pretrained.load_model_and_alphabet(model_name)
    except Exception as e:
        return {"success": False, "result": None, "error": str(e)}
    payload = {"model": model, "alphabet": alphabet}
    return {"success": True, "result": payload, "error": None}
|
| 31 |
+
|
| 32 |
+
@mcp.tool(name="process_sequence_data", description="Process protein sequence data")
def process_sequence_data(sequences: list):
    """
    Tokenize protein sequence records with an Alphabet and BatchConverter.

    Parameters:
        sequences (list): List of (label, description, sequence) tuples.

    Returns:
        dict: Contains success/result/error fields.
    """
    try:
        converter = data.BatchConverter(data.Alphabet())
        return {"success": True, "result": converter(sequences), "error": None}
    except Exception as e:
        return {"success": False, "result": None, "error": str(e)}
|
| 50 |
+
|
| 51 |
+
@mcp.tool(name="inverse_folding_model", description="Load inverse folding model")
def inverse_folding_model():
    """
    Load the core model for inverse folding tasks.

    Returns:
        dict: Contains success/result/error fields.
    """
    try:
        loaded = inverse_folding.load_inverse_folding_model()
    except Exception as e:
        return {"success": False, "result": None, "error": str(e)}
    return {"success": True, "result": loaded, "error": None}
|
| 64 |
+
|
| 65 |
+
@mcp.tool(name="generate_fixed_backbone", description="Generate protein sequence with fixed backbone")
def generate_fixed_backbone(input_data: dict):
    """
    Generate protein sequences using a fixed backbone.

    The backing implementation (examples.lm_design) is not importable in this
    deployment, so this tool reports itself as unavailable.

    Parameters:
        input_data (dict): Input data payload.

    Returns:
        dict: Contains success/result/error fields.
    """
    # Bug fix: the original called `lm_design.generate_fixed_backbone(...)`
    # even though the lm_design import is commented out at module top, so every
    # call raised NameError and the intended "unavailable" message below was
    # dead code. Keep the disabled call as a comment, like predict_variant_effect.
    # result = lm_design.generate_fixed_backbone(input_data)
    return {"success": False, "result": None, "error": "This feature is currently unavailable"}
|
| 81 |
+
|
| 82 |
+
@mcp.tool(name="predict_variant_effect", description="Predict protein variant effects")
def predict_variant_effect(sequence: str, mutation: str):
    """
    Predict the effect of a mutation in a protein sequence.

    The backing predictor (examples.variant_prediction) is not importable in
    this deployment, so this tool reports itself as unavailable.

    Parameters:
        sequence (str): Protein sequence.
        mutation (str): Mutation description.

    Returns:
        dict: Contains success/result/error fields.
    """
    # Disabled until examples.variant_prediction is shippable:
    # result = predict.predict_variant_effect(sequence, mutation)
    return {"success": False, "result": None, "error": "This feature is currently unavailable"}
|
| 99 |
+
|
| 100 |
+
@mcp.tool(name="extract_features", description="Extract features from model")
def extract_features(sequence: str):
    """
    Extract features of a protein sequence from a pretrained model.

    The backing implementation (scripts.extract) is not importable in this
    deployment, so this tool reports itself as unavailable.

    Parameters:
        sequence (str): Protein sequence.

    Returns:
        dict: Contains success/result/error fields.
    """
    # Bug fix: the original called `extract.extract_features(sequence)` even
    # though the scripts.extract import is commented out at module top, so
    # every call raised NameError (surfaced as an opaque error string). Report
    # the feature as unavailable explicitly, matching the sibling tools.
    # features = extract.extract_features(sequence)
    return {"success": False, "result": None, "error": "This feature is currently unavailable"}
|
| 116 |
+
|
| 117 |
+
@mcp.tool(name="predict_structure", description="Predict protein structure using ESMFold API")
def predict_structure(sequence: str):
    """
    Predict protein structure using the ESMFold API and save the PDB locally.

    Parameters:
        sequence (str): Protein amino acid sequence.

    Returns:
        dict: success/result/error; on success `result` holds model/chain/
        residue/atom counts, the raw PDB text and the saved file path.
    """
    # Bug fix: the original imported requests inside the main try block but
    # also referenced `requests.exceptions.Timeout` in an except clause; when
    # the import itself failed, evaluating that clause raised NameError
    # instead of returning the documented error payload. Import the optional
    # dependencies first and convert ImportError explicitly.
    try:
        import datetime
        import io

        import requests
        from Bio.PDB import PDBParser
    except ImportError as e:
        return {"success": False, "result": None, "error": f"Error predicting structure: {e}"}

    try:
        response = requests.post(
            "https://api.esmatlas.com/foldSequence/v1/pdb/",
            data=sequence,
            timeout=300,
        )
        if response.status_code != 200 or not response.text.strip():
            return {"success": False, "result": None, "error": f"API returned error: {response.status_code}"}

        parser = PDBParser(QUIET=True)
        structure = parser.get_structure("esmfold_prediction", io.StringIO(response.text))

        # Persist the prediction next to the service for later inspection.
        predictions_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "predictions"
        )
        os.makedirs(predictions_dir, exist_ok=True)
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        pdb_filepath = os.path.join(predictions_dir, f"prediction_{timestamp}.pdb")
        with open(pdb_filepath, "w") as f:
            f.write(response.text)

        structure_info = {
            "num_models": len(structure),
            "num_chains": len(list(structure.get_chains())),
            "num_residues": len(list(structure.get_residues())),
            "num_atoms": len(list(structure.get_atoms())),
            "pdb_content": response.text,
            "pdb_file_path": pdb_filepath,
        }
        return {"success": True, "result": structure_info, "error": None}
    except requests.exceptions.Timeout:
        return {"success": False, "result": None, "error": "ESMFold API request timed out"}
    except Exception as e:
        return {"success": False, "result": None, "error": f"Error predicting structure: {str(e)}"}
|
| 191 |
+
|
| 192 |
+
@mcp.tool(name="analyze_protein_sequence", description="Analyze protein sequence features")
def analyze_protein_sequence(sequence: str):
    """
    Analyze basic features of a protein sequence.

    Parameters:
        sequence (str): Protein sequence.

    Returns:
        dict: success/result/error; result holds the length, distinct residue
        count, a per-residue composition dict and the original sequence.
    """
    try:
        from collections import Counter

        # Single O(n) counting pass instead of the original
        # sequence.count(aa) loop, which was O(n * distinct residues).
        composition = dict(Counter(sequence))
        return {
            "success": True,
            "result": {
                "length": len(sequence),
                "unique_amino_acids": len(composition),
                "composition": composition,
                "sequence": sequence,
            },
            "error": None,
        }
    except Exception as e:
        return {"success": False, "result": None, "error": str(e)}
|
| 220 |
+
|
| 221 |
+
@mcp.tool(name="validate_protein_sequence", description="Validate protein sequence format")
def validate_protein_sequence(sequence: str):
    """
    Check that a sequence uses only the 20 canonical amino-acid codes.

    Validation is case-insensitive: the sequence is upper-cased first.

    Parameters:
        sequence (str): Protein sequence.

    Returns:
        dict: success/result/error; result holds is_valid, the offending
        characters (if any), the length and the upper-cased sequence.
    """
    try:
        canonical = set("ACDEFGHIKLMNPQRSTVWY")
        normalized = sequence.upper()
        offending = set(normalized) - canonical
        report = {
            "is_valid": not offending,
            "invalid_characters": list(offending) if offending else [],
            "length": len(sequence),
            "uppercase_sequence": normalized,
        }
        return {"success": True, "result": report, "error": None}
    except Exception as e:
        return {"success": False, "result": None, "error": str(e)}
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def create_app():
    """
    Return the module-level FastMCP service instance.

    Returns:
        FastMCP: MCP service instance with all tools registered.
    """
    return mcp
|
esm/mcp_output/predictions/prediction_20250823_235651.pdb
ADDED
|
@@ -0,0 +1,528 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
HEADER 18-OCT-22
|
| 2 |
+
TITLE ESMFOLD V1 PREDICTION FOR INPUT
|
| 3 |
+
REMARK 1
|
| 4 |
+
REMARK 1 REFERENCE 1
|
| 5 |
+
REMARK 1 AUTH ZEMING LIN, HALIL AKIN, ROSHAN RAO, BRIAN HIE, ZHONGKAI ZHU,
|
| 6 |
+
REMARK 1 AUTH 2 WENTING LU, NIKITA SMETANIN, ROBERT VERKUIL, ORI KABELI,
|
| 7 |
+
REMARK 1 AUTH 3 YANIV SHMUELI, ALLAN DOS SANTOS COSTA,
|
| 8 |
+
REMARK 1 AUTH 4 MARYAM FAZEL-ZARANDI, TOM SERCU, SALVATORE CANDIDO,
|
| 9 |
+
REMARK 1 AUTH 5 ALEXANDER RIVES
|
| 10 |
+
REMARK 1 TITL EVOLUTIONARY-SCALE PREDICTION OF ATOMIC LEVEL PROTEIN
|
| 11 |
+
REMARK 1 TITL 2 STRUCTURE WITH A LANGUAGE MODEL
|
| 12 |
+
REMARK 1 REF
|
| 13 |
+
REMARK 1 REFN
|
| 14 |
+
REMARK 1 PMID
|
| 15 |
+
REMARK 1 DOI 10.1101/2022.07.20.500902
|
| 16 |
+
REMARK 1
|
| 17 |
+
REMARK 1 LICENSE AND DISCLAIMERS
|
| 18 |
+
REMARK 1 ESM METAGENOMIC ATLAS DATA IS AVAILABLE UNDER
|
| 19 |
+
REMARK 1 A CC-BY-4.0 LICENSE FOR ACADEMIC AND COMMERCIAL USE.
|
| 20 |
+
REMARK 1 COPYRIGHT (C) META PLATFORMS, INC. ALL RIGHTS RESERVED.
|
| 21 |
+
REMARK 1 USE OF THE ESM METAGENOMIC ATLAS DATA IS SUBJECT
|
| 22 |
+
REMARK 1 TO THE META OPEN SOURCE TERMS OF USE AND PRIVACY POLICY.
|
| 23 |
+
ATOM 1 N MET A 1 3.833 -6.152 -16.813 1.00 0.56 N
|
| 24 |
+
ATOM 2 CA MET A 1 3.566 -6.555 -15.436 1.00 0.60 C
|
| 25 |
+
ATOM 3 C MET A 1 4.430 -5.763 -14.460 1.00 0.59 C
|
| 26 |
+
ATOM 4 CB MET A 1 3.813 -8.054 -15.256 1.00 0.51 C
|
| 27 |
+
ATOM 5 O MET A 1 3.939 -5.283 -13.437 1.00 0.57 O
|
| 28 |
+
ATOM 6 CG MET A 1 2.731 -8.762 -14.456 1.00 0.47 C
|
| 29 |
+
ATOM 7 SD MET A 1 2.917 -10.587 -14.484 1.00 0.54 S
|
| 30 |
+
ATOM 8 CE MET A 1 4.224 -10.795 -13.242 1.00 0.45 C
|
| 31 |
+
ATOM 9 N LYS A 2 5.782 -5.722 -14.739 1.00 0.75 N
|
| 32 |
+
ATOM 10 CA LYS A 2 6.694 -4.973 -13.880 1.00 0.77 C
|
| 33 |
+
ATOM 11 C LYS A 2 6.314 -3.495 -13.833 1.00 0.78 C
|
| 34 |
+
ATOM 12 CB LYS A 2 8.137 -5.128 -14.363 1.00 0.69 C
|
| 35 |
+
ATOM 13 O LYS A 2 6.399 -2.860 -12.780 1.00 0.75 O
|
| 36 |
+
ATOM 14 CG LYS A 2 8.788 -6.441 -13.954 1.00 0.60 C
|
| 37 |
+
ATOM 15 CD LYS A 2 10.260 -6.480 -14.343 1.00 0.59 C
|
| 38 |
+
ATOM 16 CE LYS A 2 10.894 -7.822 -14.003 1.00 0.55 C
|
| 39 |
+
ATOM 17 NZ LYS A 2 12.336 -7.867 -14.391 1.00 0.47 N
|
| 40 |
+
ATOM 18 N THR A 3 5.787 -3.126 -14.975 1.00 0.84 N
|
| 41 |
+
ATOM 19 CA THR A 3 5.441 -1.712 -15.059 1.00 0.85 C
|
| 42 |
+
ATOM 20 C THR A 3 4.228 -1.399 -14.187 1.00 0.87 C
|
| 43 |
+
ATOM 21 CB THR A 3 5.153 -1.292 -16.513 1.00 0.79 C
|
| 44 |
+
ATOM 22 O THR A 3 4.184 -0.360 -13.526 1.00 0.85 O
|
| 45 |
+
ATOM 23 CG2 THR A 3 4.989 0.220 -16.626 1.00 0.59 C
|
| 46 |
+
ATOM 24 OG1 THR A 3 6.241 -1.707 -17.348 1.00 0.56 O
|
| 47 |
+
ATOM 25 N VAL A 4 3.332 -2.302 -14.196 1.00 0.89 N
|
| 48 |
+
ATOM 26 CA VAL A 4 2.111 -2.067 -13.432 1.00 0.90 C
|
| 49 |
+
ATOM 27 C VAL A 4 2.434 -2.024 -11.941 1.00 0.91 C
|
| 50 |
+
ATOM 28 CB VAL A 4 1.047 -3.152 -13.715 1.00 0.87 C
|
| 51 |
+
ATOM 29 O VAL A 4 1.944 -1.153 -11.218 1.00 0.90 O
|
| 52 |
+
ATOM 30 CG1 VAL A 4 -0.154 -2.985 -12.787 1.00 0.77 C
|
| 53 |
+
ATOM 31 CG2 VAL A 4 0.608 -3.099 -15.178 1.00 0.76 C
|
| 54 |
+
ATOM 32 N ARG A 5 3.274 -2.915 -11.450 1.00 0.92 N
|
| 55 |
+
ATOM 33 CA ARG A 5 3.645 -2.914 -10.038 1.00 0.93 C
|
| 56 |
+
ATOM 34 C ARG A 5 4.425 -1.655 -9.677 1.00 0.93 C
|
| 57 |
+
ATOM 35 CB ARG A 5 4.470 -4.157 -9.699 1.00 0.92 C
|
| 58 |
+
ATOM 36 O ARG A 5 4.218 -1.075 -8.609 1.00 0.92 O
|
| 59 |
+
ATOM 37 CG ARG A 5 4.755 -4.321 -8.214 1.00 0.90 C
|
| 60 |
+
ATOM 38 CD ARG A 5 5.547 -5.589 -7.929 1.00 0.89 C
|
| 61 |
+
ATOM 39 NE ARG A 5 5.763 -5.779 -6.497 1.00 0.87 N
|
| 62 |
+
ATOM 40 NH1 ARG A 5 7.737 -6.954 -6.739 1.00 0.81 N
|
| 63 |
+
ATOM 41 NH2 ARG A 5 6.895 -6.538 -4.648 1.00 0.80 N
|
| 64 |
+
ATOM 42 CZ ARG A 5 6.798 -6.423 -5.965 1.00 0.85 C
|
| 65 |
+
ATOM 43 N GLN A 6 5.318 -1.260 -10.546 1.00 0.92 N
|
| 66 |
+
ATOM 44 CA GLN A 6 6.089 -0.047 -10.296 1.00 0.92 C
|
| 67 |
+
ATOM 45 C GLN A 6 5.173 1.165 -10.143 1.00 0.93 C
|
| 68 |
+
ATOM 46 CB GLN A 6 7.094 0.194 -11.424 1.00 0.90 C
|
| 69 |
+
ATOM 47 O GLN A 6 5.386 2.003 -9.264 1.00 0.92 O
|
| 70 |
+
ATOM 48 CG GLN A 6 8.270 -0.772 -11.415 1.00 0.80 C
|
| 71 |
+
ATOM 49 CD GLN A 6 9.166 -0.617 -12.630 1.00 0.75 C
|
| 72 |
+
ATOM 50 NE2 GLN A 6 10.400 -1.096 -12.522 1.00 0.64 N
|
| 73 |
+
ATOM 51 OE1 GLN A 6 8.751 -0.072 -13.658 1.00 0.70 O
|
| 74 |
+
ATOM 52 N GLU A 7 4.209 1.185 -11.055 1.00 0.92 N
|
| 75 |
+
ATOM 53 CA GLU A 7 3.260 2.291 -10.961 1.00 0.92 C
|
| 76 |
+
ATOM 54 C GLU A 7 2.452 2.217 -9.669 1.00 0.93 C
|
| 77 |
+
ATOM 55 CB GLU A 7 2.320 2.297 -12.170 1.00 0.90 C
|
| 78 |
+
ATOM 56 O GLU A 7 2.168 3.244 -9.049 1.00 0.92 O
|
| 79 |
+
ATOM 57 CG GLU A 7 2.993 2.712 -13.470 1.00 0.81 C
|
| 80 |
+
ATOM 58 CD GLU A 7 3.663 4.074 -13.390 1.00 0.76 C
|
| 81 |
+
ATOM 59 OE1 GLU A 7 3.045 5.025 -12.860 1.00 0.71 O
|
| 82 |
+
ATOM 60 OE2 GLU A 7 4.816 4.192 -13.863 1.00 0.68 O
|
| 83 |
+
ATOM 61 N ARG A 8 2.161 1.024 -9.290 1.00 0.94 N
|
| 84 |
+
ATOM 62 CA ARG A 8 1.415 0.847 -8.049 1.00 0.94 C
|
| 85 |
+
ATOM 63 C ARG A 8 2.247 1.276 -6.844 1.00 0.95 C
|
| 86 |
+
ATOM 64 CB ARG A 8 0.974 -0.609 -7.889 1.00 0.94 C
|
| 87 |
+
ATOM 65 O ARG A 8 1.748 1.966 -5.953 1.00 0.94 O
|
| 88 |
+
ATOM 66 CG ARG A 8 0.090 -0.856 -6.676 1.00 0.93 C
|
| 89 |
+
ATOM 67 CD ARG A 8 -0.399 -2.296 -6.618 1.00 0.91 C
|
| 90 |
+
ATOM 68 NE ARG A 8 0.707 -3.234 -6.450 1.00 0.90 N
|
| 91 |
+
ATOM 69 NH1 ARG A 8 0.132 -4.557 -8.255 1.00 0.83 N
|
| 92 |
+
ATOM 70 NH2 ARG A 8 1.970 -5.075 -6.987 1.00 0.82 N
|
| 93 |
+
ATOM 71 CZ ARG A 8 0.934 -4.287 -7.231 1.00 0.88 C
|
| 94 |
+
ATOM 72 N LEU A 9 3.502 0.910 -6.829 1.00 0.94 N
|
| 95 |
+
ATOM 73 CA LEU A 9 4.402 1.277 -5.741 1.00 0.95 C
|
| 96 |
+
ATOM 74 C LEU A 9 4.528 2.793 -5.628 1.00 0.94 C
|
| 97 |
+
ATOM 75 CB LEU A 9 5.784 0.654 -5.956 1.00 0.94 C
|
| 98 |
+
ATOM 76 O LEU A 9 4.464 3.345 -4.528 1.00 0.94 O
|
| 99 |
+
ATOM 77 CG LEU A 9 5.867 -0.869 -5.836 1.00 0.93 C
|
| 100 |
+
ATOM 78 CD1 LEU A 9 7.227 -1.365 -6.316 1.00 0.90 C
|
| 101 |
+
ATOM 79 CD2 LEU A 9 5.608 -1.308 -4.399 1.00 0.90 C
|
| 102 |
+
ATOM 80 N LYS A 10 4.686 3.490 -6.747 1.00 0.94 N
|
| 103 |
+
ATOM 81 CA LYS A 10 4.773 4.947 -6.761 1.00 0.94 C
|
| 104 |
+
ATOM 82 C LYS A 10 3.489 5.580 -6.231 1.00 0.94 C
|
| 105 |
+
ATOM 83 CB LYS A 10 5.061 5.454 -8.175 1.00 0.93 C
|
| 106 |
+
ATOM 84 O LYS A 10 3.534 6.594 -5.531 1.00 0.94 O
|
| 107 |
+
ATOM 85 CG LYS A 10 6.475 5.166 -8.660 1.00 0.86 C
|
| 108 |
+
ATOM 86 CD LYS A 10 6.688 5.659 -10.085 1.00 0.81 C
|
| 109 |
+
ATOM 87 CE LYS A 10 8.032 5.206 -10.639 1.00 0.73 C
|
| 110 |
+
ATOM 88 NZ LYS A 10 8.191 5.574 -12.077 1.00 0.64 N
|
| 111 |
+
ATOM 89 N SER A 11 2.412 4.973 -6.576 1.00 0.95 N
|
| 112 |
+
ATOM 90 CA SER A 11 1.124 5.485 -6.118 1.00 0.95 C
|
| 113 |
+
ATOM 91 C SER A 11 0.985 5.357 -4.605 1.00 0.95 C
|
| 114 |
+
ATOM 92 CB SER A 11 -0.022 4.745 -6.808 1.00 0.94 C
|
| 115 |
+
ATOM 93 O SER A 11 0.476 6.266 -3.945 1.00 0.95 O
|
| 116 |
+
ATOM 94 OG SER A 11 -0.073 5.069 -8.187 1.00 0.85 O
|
| 117 |
+
ATOM 95 N ILE A 12 1.404 4.270 -4.118 1.00 0.95 N
|
| 118 |
+
ATOM 96 CA ILE A 12 1.342 4.069 -2.674 1.00 0.96 C
|
| 119 |
+
ATOM 97 C ILE A 12 2.158 5.149 -1.968 1.00 0.95 C
|
| 120 |
+
ATOM 98 CB ILE A 12 1.851 2.666 -2.276 1.00 0.95 C
|
| 121 |
+
ATOM 99 O ILE A 12 1.680 5.776 -1.019 1.00 0.95 O
|
| 122 |
+
ATOM 100 CG1 ILE A 12 0.873 1.587 -2.754 1.00 0.94 C
|
| 123 |
+
ATOM 101 CG2 ILE A 12 2.067 2.579 -0.762 1.00 0.94 C
|
| 124 |
+
ATOM 102 CD1 ILE A 12 1.390 0.165 -2.589 1.00 0.93 C
|
| 125 |
+
ATOM 103 N VAL A 13 3.373 5.365 -2.383 1.00 0.95 N
|
| 126 |
+
ATOM 104 CA VAL A 13 4.255 6.350 -1.765 1.00 0.95 C
|
| 127 |
+
ATOM 105 C VAL A 13 3.625 7.738 -1.859 1.00 0.95 C
|
| 128 |
+
ATOM 106 CB VAL A 13 5.653 6.353 -2.424 1.00 0.94 C
|
| 129 |
+
ATOM 107 O VAL A 13 3.621 8.492 -0.883 1.00 0.94 O
|
| 130 |
+
ATOM 108 CG1 VAL A 13 6.485 7.530 -1.919 1.00 0.92 C
|
| 131 |
+
ATOM 109 CG2 VAL A 13 6.371 5.032 -2.155 1.00 0.92 C
|
| 132 |
+
ATOM 110 N ARG A 14 3.008 8.094 -3.002 1.00 0.94 N
|
| 133 |
+
ATOM 111 CA ARG A 14 2.369 9.390 -3.204 1.00 0.94 C
|
| 134 |
+
ATOM 112 C ARG A 14 1.185 9.570 -2.261 1.00 0.94 C
|
| 135 |
+
ATOM 113 CB ARG A 14 1.911 9.543 -4.656 1.00 0.93 C
|
| 136 |
+
ATOM 114 O ARG A 14 1.009 10.640 -1.674 1.00 0.94 O
|
| 137 |
+
ATOM 115 CG ARG A 14 3.035 9.869 -5.626 1.00 0.83 C
|
| 138 |
+
ATOM 116 CD ARG A 14 2.503 10.221 -7.009 1.00 0.77 C
|
| 139 |
+
ATOM 117 NE ARG A 14 2.183 9.025 -7.783 1.00 0.72 N
|
| 140 |
+
ATOM 118 NH1 ARG A 14 1.327 10.176 -9.594 1.00 0.52 N
|
| 141 |
+
ATOM 119 NH2 ARG A 14 1.383 7.884 -9.609 1.00 0.47 N
|
| 142 |
+
ATOM 120 CZ ARG A 14 1.632 9.031 -8.994 1.00 0.68 C
|
| 143 |
+
ATOM 121 N ILE A 15 0.432 8.498 -2.193 1.00 0.95 N
|
| 144 |
+
ATOM 122 CA ILE A 15 -0.742 8.563 -1.329 1.00 0.95 C
|
| 145 |
+
ATOM 123 C ILE A 15 -0.307 8.788 0.117 1.00 0.95 C
|
| 146 |
+
ATOM 124 CB ILE A 15 -1.596 7.280 -1.438 1.00 0.95 C
|
| 147 |
+
ATOM 125 O ILE A 15 -0.849 9.654 0.807 1.00 0.94 O
|
| 148 |
+
ATOM 126 CG1 ILE A 15 -2.264 7.198 -2.816 1.00 0.93 C
|
| 149 |
+
ATOM 127 CG2 ILE A 15 -2.640 7.229 -0.319 1.00 0.93 C
|
| 150 |
+
ATOM 128 CD1 ILE A 15 -2.880 5.841 -3.126 1.00 0.92 C
|
| 151 |
+
ATOM 129 N LEU A 16 0.689 8.051 0.584 1.00 0.95 N
|
| 152 |
+
ATOM 130 CA LEU A 16 1.143 8.158 1.966 1.00 0.95 C
|
| 153 |
+
ATOM 131 C LEU A 16 1.813 9.505 2.215 1.00 0.95 C
|
| 154 |
+
ATOM 132 CB LEU A 16 2.113 7.022 2.301 1.00 0.95 C
|
| 155 |
+
ATOM 133 O LEU A 16 1.694 10.071 3.304 1.00 0.94 O
|
| 156 |
+
ATOM 134 CG LEU A 16 1.521 5.612 2.331 1.00 0.95 C
|
| 157 |
+
ATOM 135 CD1 LEU A 16 2.609 4.587 2.634 1.00 0.94 C
|
| 158 |
+
ATOM 136 CD2 LEU A 16 0.397 5.525 3.357 1.00 0.93 C
|
| 159 |
+
ATOM 137 N GLU A 17 2.509 10.095 1.227 1.00 0.94 N
|
| 160 |
+
ATOM 138 CA GLU A 17 3.177 11.387 1.356 1.00 0.94 C
|
| 161 |
+
ATOM 139 C GLU A 17 2.165 12.522 1.484 1.00 0.93 C
|
| 162 |
+
ATOM 140 CB GLU A 17 4.100 11.637 0.161 1.00 0.92 C
|
| 163 |
+
ATOM 141 O GLU A 17 2.413 13.505 2.184 1.00 0.92 O
|
| 164 |
+
ATOM 142 CG GLU A 17 5.412 10.868 0.225 1.00 0.86 C
|
| 165 |
+
ATOM 143 CD GLU A 17 6.272 11.044 -1.016 1.00 0.81 C
|
| 166 |
+
ATOM 144 OE1 GLU A 17 5.739 11.459 -2.070 1.00 0.78 O
|
| 167 |
+
ATOM 145 OE2 GLU A 17 7.489 10.764 -0.934 1.00 0.76 O
|
| 168 |
+
ATOM 146 N ARG A 18 1.030 12.289 0.890 1.00 0.93 N
|
| 169 |
+
ATOM 147 CA ARG A 18 0.060 13.378 0.835 1.00 0.93 C
|
| 170 |
+
ATOM 148 C ARG A 18 -0.916 13.301 2.003 1.00 0.92 C
|
| 171 |
+
ATOM 149 CB ARG A 18 -0.705 13.351 -0.490 1.00 0.91 C
|
| 172 |
+
ATOM 150 O ARG A 18 -1.588 14.284 2.323 1.00 0.91 O
|
| 173 |
+
ATOM 151 CG ARG A 18 0.139 13.731 -1.697 1.00 0.83 C
|
| 174 |
+
ATOM 152 CD ARG A 18 -0.656 13.640 -2.992 1.00 0.79 C
|
| 175 |
+
ATOM 153 NE ARG A 18 0.139 14.059 -4.143 1.00 0.73 N
|
| 176 |
+
ATOM 154 NH1 ARG A 18 -1.546 13.730 -5.688 1.00 0.56 N
|
| 177 |
+
ATOM 155 NH2 ARG A 18 0.507 14.490 -6.368 1.00 0.51 N
|
| 178 |
+
ATOM 156 CZ ARG A 18 -0.302 14.092 -5.397 1.00 0.71 C
|
| 179 |
+
ATOM 157 N SER A 19 -0.853 12.101 2.564 1.00 0.92 N
|
| 180 |
+
ATOM 158 CA SER A 19 -1.857 11.925 3.607 1.00 0.92 C
|
| 181 |
+
ATOM 159 C SER A 19 -1.372 12.481 4.942 1.00 0.91 C
|
| 182 |
+
ATOM 160 CB SER A 19 -2.215 10.446 3.762 1.00 0.91 C
|
| 183 |
+
ATOM 161 O SER A 19 -0.232 12.239 5.344 1.00 0.90 O
|
| 184 |
+
ATOM 162 OG SER A 19 -3.219 10.272 4.748 1.00 0.85 O
|
| 185 |
+
ATOM 163 N LYS A 20 -2.211 13.277 5.567 1.00 0.89 N
|
| 186 |
+
ATOM 164 CA LYS A 20 -1.915 13.785 6.903 1.00 0.89 C
|
| 187 |
+
ATOM 165 C LYS A 20 -2.272 12.759 7.974 1.00 0.89 C
|
| 188 |
+
ATOM 166 CB LYS A 20 -2.667 15.092 7.158 1.00 0.87 C
|
| 189 |
+
ATOM 167 O LYS A 20 -1.768 12.824 9.097 1.00 0.87 O
|
| 190 |
+
ATOM 168 CG LYS A 20 -2.206 16.252 6.287 1.00 0.79 C
|
| 191 |
+
ATOM 169 CD LYS A 20 -2.955 17.535 6.623 1.00 0.76 C
|
| 192 |
+
ATOM 170 CE LYS A 20 -2.516 18.689 5.732 1.00 0.68 C
|
| 193 |
+
ATOM 171 NZ LYS A 20 -3.289 19.935 6.018 1.00 0.61 N
|
| 194 |
+
ATOM 172 N GLU A 21 -3.098 11.887 7.683 1.00 0.92 N
|
| 195 |
+
ATOM 173 CA GLU A 21 -3.549 10.825 8.577 1.00 0.92 C
|
| 196 |
+
ATOM 174 C GLU A 21 -3.148 9.451 8.047 1.00 0.92 C
|
| 197 |
+
ATOM 175 CB GLU A 21 -5.066 10.892 8.770 1.00 0.90 C
|
| 198 |
+
ATOM 176 O GLU A 21 -2.881 9.294 6.854 1.00 0.92 O
|
| 199 |
+
ATOM 177 CG GLU A 21 -5.548 12.188 9.406 1.00 0.84 C
|
| 200 |
+
ATOM 178 CD GLU A 21 -7.025 12.165 9.769 1.00 0.80 C
|
| 201 |
+
ATOM 179 OE1 GLU A 21 -7.741 11.233 9.339 1.00 0.78 O
|
| 202 |
+
ATOM 180 OE2 GLU A 21 -7.468 13.088 10.489 1.00 0.74 O
|
| 203 |
+
ATOM 181 N PRO A 22 -3.069 8.392 8.977 1.00 0.94 N
|
| 204 |
+
ATOM 182 CA PRO A 22 -2.802 7.027 8.516 1.00 0.95 C
|
| 205 |
+
ATOM 183 C PRO A 22 -3.802 6.555 7.462 1.00 0.95 C
|
| 206 |
+
ATOM 184 CB PRO A 22 -2.921 6.193 9.794 1.00 0.94 C
|
| 207 |
+
ATOM 185 O PRO A 22 -4.990 6.878 7.544 1.00 0.94 O
|
| 208 |
+
ATOM 186 CG PRO A 22 -2.711 7.168 10.907 1.00 0.93 C
|
| 209 |
+
ATOM 187 CD PRO A 22 -3.262 8.500 10.485 1.00 0.91 C
|
| 210 |
+
ATOM 188 N VAL A 23 -3.318 5.865 6.476 1.00 0.95 N
|
| 211 |
+
ATOM 189 CA VAL A 23 -4.137 5.301 5.408 1.00 0.95 C
|
| 212 |
+
ATOM 190 C VAL A 23 -4.274 3.793 5.603 1.00 0.95 C
|
| 213 |
+
ATOM 191 CB VAL A 23 -3.543 5.607 4.015 1.00 0.95 C
|
| 214 |
+
ATOM 192 O VAL A 23 -3.273 3.074 5.652 1.00 0.95 O
|
| 215 |
+
ATOM 193 CG1 VAL A 23 -4.477 5.115 2.910 1.00 0.93 C
|
| 216 |
+
ATOM 194 CG2 VAL A 23 -3.276 7.103 3.866 1.00 0.93 C
|
| 217 |
+
ATOM 195 N SER A 24 -5.474 3.345 5.655 1.00 0.96 N
|
| 218 |
+
ATOM 196 CA SER A 24 -5.672 1.925 5.923 1.00 0.96 C
|
| 219 |
+
ATOM 197 C SER A 24 -5.316 1.077 4.706 1.00 0.96 C
|
| 220 |
+
ATOM 198 CB SER A 24 -7.119 1.653 6.338 1.00 0.95 C
|
| 221 |
+
ATOM 199 O SER A 24 -5.386 1.552 3.571 1.00 0.95 O
|
| 222 |
+
ATOM 200 OG SER A 24 -7.981 1.698 5.214 1.00 0.91 O
|
| 223 |
+
ATOM 201 N GLY A 25 -4.862 -0.173 5.000 1.00 0.95 N
|
| 224 |
+
ATOM 202 CA GLY A 25 -4.628 -1.110 3.914 1.00 0.95 C
|
| 225 |
+
ATOM 203 C GLY A 25 -5.840 -1.304 3.021 1.00 0.95 C
|
| 226 |
+
ATOM 204 O GLY A 25 -5.706 -1.426 1.802 1.00 0.95 O
|
| 227 |
+
ATOM 205 N ALA A 26 -7.029 -1.338 3.592 1.00 0.95 N
|
| 228 |
+
ATOM 206 CA ALA A 26 -8.269 -1.507 2.839 1.00 0.95 C
|
| 229 |
+
ATOM 207 C ALA A 26 -8.487 -0.346 1.873 1.00 0.95 C
|
| 230 |
+
ATOM 208 CB ALA A 26 -9.456 -1.633 3.790 1.00 0.94 C
|
| 231 |
+
ATOM 209 O ALA A 26 -8.901 -0.551 0.729 1.00 0.95 O
|
| 232 |
+
ATOM 210 N GLN A 27 -8.247 0.886 2.350 1.00 0.95 N
|
| 233 |
+
ATOM 211 CA GLN A 27 -8.373 2.066 1.501 1.00 0.95 C
|
| 234 |
+
ATOM 212 C GLN A 27 -7.389 2.013 0.335 1.00 0.95 C
|
| 235 |
+
ATOM 213 CB GLN A 27 -8.151 3.341 2.317 1.00 0.94 C
|
| 236 |
+
ATOM 214 O GLN A 27 -7.757 2.295 -0.807 1.00 0.95 O
|
| 237 |
+
ATOM 215 CG GLN A 27 -8.308 4.623 1.511 1.00 0.86 C
|
| 238 |
+
ATOM 216 CD GLN A 27 -8.034 5.870 2.330 1.00 0.81 C
|
| 239 |
+
ATOM 217 NE2 GLN A 27 -7.923 7.010 1.657 1.00 0.73 N
|
| 240 |
+
ATOM 218 OE1 GLN A 27 -7.923 5.809 3.559 1.00 0.78 O
|
| 241 |
+
ATOM 219 N LEU A 28 -6.152 1.593 0.619 1.00 0.96 N
|
| 242 |
+
ATOM 220 CA LEU A 28 -5.160 1.467 -0.444 1.00 0.96 C
|
| 243 |
+
ATOM 221 C LEU A 28 -5.574 0.395 -1.448 1.00 0.96 C
|
| 244 |
+
ATOM 222 CB LEU A 28 -3.787 1.131 0.142 1.00 0.95 C
|
| 245 |
+
ATOM 223 O LEU A 28 -5.474 0.602 -2.659 1.00 0.95 O
|
| 246 |
+
ATOM 224 CG LEU A 28 -3.083 2.253 0.906 1.00 0.95 C
|
| 247 |
+
ATOM 225 CD1 LEU A 28 -1.839 1.718 1.608 1.00 0.93 C
|
| 248 |
+
ATOM 226 CD2 LEU A 28 -2.721 3.397 -0.035 1.00 0.92 C
|
| 249 |
+
ATOM 227 N ALA A 29 -6.050 -0.750 -0.922 1.00 0.96 N
|
| 250 |
+
ATOM 228 CA ALA A 29 -6.463 -1.859 -1.778 1.00 0.96 C
|
| 251 |
+
ATOM 229 C ALA A 29 -7.599 -1.441 -2.708 1.00 0.96 C
|
| 252 |
+
ATOM 230 CB ALA A 29 -6.886 -3.056 -0.931 1.00 0.95 C
|
| 253 |
+
ATOM 231 O ALA A 29 -7.574 -1.741 -3.903 1.00 0.95 O
|
| 254 |
+
ATOM 232 N GLU A 30 -8.555 -0.688 -2.175 1.00 0.96 N
|
| 255 |
+
ATOM 233 CA GLU A 30 -9.687 -0.188 -2.949 1.00 0.95 C
|
| 256 |
+
ATOM 234 C GLU A 30 -9.235 0.828 -3.995 1.00 0.95 C
|
| 257 |
+
ATOM 235 CB GLU A 30 -10.735 0.440 -2.026 1.00 0.94 C
|
| 258 |
+
ATOM 236 O GLU A 30 -9.618 0.736 -5.163 1.00 0.94 O
|
| 259 |
+
ATOM 237 CG GLU A 30 -12.004 0.876 -2.744 1.00 0.85 C
|
| 260 |
+
ATOM 238 CD GLU A 30 -13.067 1.424 -1.804 1.00 0.79 C
|
| 261 |
+
ATOM 239 OE1 GLU A 30 -12.798 1.542 -0.587 1.00 0.77 O
|
| 262 |
+
ATOM 240 OE2 GLU A 30 -14.177 1.736 -2.288 1.00 0.74 O
|
| 263 |
+
ATOM 241 N GLU A 31 -8.407 1.749 -3.572 1.00 0.95 N
|
| 264 |
+
ATOM 242 CA GLU A 31 -7.963 2.822 -4.456 1.00 0.94 C
|
| 265 |
+
ATOM 243 C GLU A 31 -7.129 2.277 -5.612 1.00 0.94 C
|
| 266 |
+
ATOM 244 CB GLU A 31 -7.159 3.864 -3.674 1.00 0.92 C
|
| 267 |
+
ATOM 245 O GLU A 31 -7.197 2.791 -6.730 1.00 0.93 O
|
| 268 |
+
ATOM 246 CG GLU A 31 -6.777 5.089 -4.493 1.00 0.78 C
|
| 269 |
+
ATOM 247 CD GLU A 31 -6.323 6.263 -3.642 1.00 0.72 C
|
| 270 |
+
ATOM 248 OE1 GLU A 31 -6.684 6.319 -2.444 1.00 0.68 O
|
| 271 |
+
ATOM 249 OE2 GLU A 31 -5.603 7.135 -4.176 1.00 0.66 O
|
| 272 |
+
ATOM 250 N LEU A 32 -6.421 1.249 -5.368 1.00 0.94 N
|
| 273 |
+
ATOM 251 CA LEU A 32 -5.480 0.768 -6.373 1.00 0.94 C
|
| 274 |
+
ATOM 252 C LEU A 32 -6.005 -0.494 -7.050 1.00 0.94 C
|
| 275 |
+
ATOM 253 CB LEU A 32 -4.114 0.489 -5.738 1.00 0.94 C
|
| 276 |
+
ATOM 254 O LEU A 32 -5.323 -1.081 -7.893 1.00 0.92 O
|
| 277 |
+
ATOM 255 CG LEU A 32 -3.386 1.694 -5.140 1.00 0.92 C
|
| 278 |
+
ATOM 256 CD1 LEU A 32 -2.156 1.238 -4.363 1.00 0.89 C
|
| 279 |
+
ATOM 257 CD2 LEU A 32 -2.998 2.682 -6.234 1.00 0.89 C
|
| 280 |
+
ATOM 258 N SER A 33 -7.187 -0.923 -6.692 1.00 0.95 N
|
| 281 |
+
ATOM 259 CA SER A 33 -7.876 -2.070 -7.275 1.00 0.94 C
|
| 282 |
+
ATOM 260 C SER A 33 -7.028 -3.333 -7.174 1.00 0.94 C
|
| 283 |
+
ATOM 261 CB SER A 33 -8.228 -1.798 -8.738 1.00 0.93 C
|
| 284 |
+
ATOM 262 O SER A 33 -6.863 -4.057 -8.159 1.00 0.93 O
|
| 285 |
+
ATOM 263 OG SER A 33 -9.149 -0.726 -8.842 1.00 0.85 O
|
| 286 |
+
ATOM 264 N VAL A 34 -6.509 -3.606 -5.951 1.00 0.94 N
|
| 287 |
+
ATOM 265 CA VAL A 34 -5.794 -4.840 -5.646 1.00 0.94 C
|
| 288 |
+
ATOM 266 C VAL A 34 -6.250 -5.380 -4.292 1.00 0.94 C
|
| 289 |
+
ATOM 267 CB VAL A 34 -4.264 -4.625 -5.645 1.00 0.94 C
|
| 290 |
+
ATOM 268 O VAL A 34 -7.013 -4.724 -3.580 1.00 0.94 O
|
| 291 |
+
ATOM 269 CG1 VAL A 34 -3.771 -4.252 -7.042 1.00 0.91 C
|
| 292 |
+
ATOM 270 CG2 VAL A 34 -3.878 -3.547 -4.634 1.00 0.91 C
|
| 293 |
+
ATOM 271 N SER A 35 -5.841 -6.605 -4.013 1.00 0.95 N
|
| 294 |
+
ATOM 272 CA SER A 35 -6.185 -7.200 -2.725 1.00 0.95 C
|
| 295 |
+
ATOM 273 C SER A 35 -5.372 -6.579 -1.594 1.00 0.95 C
|
| 296 |
+
ATOM 274 CB SER A 35 -5.960 -8.712 -2.756 1.00 0.94 C
|
| 297 |
+
ATOM 275 O SER A 35 -4.314 -5.993 -1.833 1.00 0.95 O
|
| 298 |
+
ATOM 276 OG SER A 35 -4.575 -9.014 -2.769 1.00 0.87 O
|
| 299 |
+
ATOM 277 N ARG A 36 -5.839 -6.717 -0.444 1.00 0.96 N
|
| 300 |
+
ATOM 278 CA ARG A 36 -5.113 -6.257 0.736 1.00 0.96 C
|
| 301 |
+
ATOM 279 C ARG A 36 -3.765 -6.959 0.857 1.00 0.96 C
|
| 302 |
+
ATOM 280 CB ARG A 36 -5.941 -6.489 2.001 1.00 0.93 C
|
| 303 |
+
ATOM 281 O ARG A 36 -2.785 -6.360 1.306 1.00 0.95 O
|
| 304 |
+
ATOM 282 CG ARG A 36 -7.147 -5.571 2.126 1.00 0.76 C
|
| 305 |
+
ATOM 283 CD ARG A 36 -7.872 -5.769 3.449 1.00 0.72 C
|
| 306 |
+
ATOM 284 NE ARG A 36 -7.044 -5.369 4.583 1.00 0.69 N
|
| 307 |
+
ATOM 285 NH1 ARG A 36 -8.302 -6.492 6.163 1.00 0.58 N
|
| 308 |
+
ATOM 286 NH2 ARG A 36 -6.450 -5.302 6.800 1.00 0.52 N
|
| 309 |
+
ATOM 287 CZ ARG A 36 -7.267 -5.722 5.846 1.00 0.64 C
|
| 310 |
+
ATOM 288 N GLN A 37 -3.757 -8.247 0.479 1.00 0.96 N
|
| 311 |
+
ATOM 289 CA GLN A 37 -2.512 -9.005 0.531 1.00 0.96 C
|
| 312 |
+
ATOM 290 C GLN A 37 -1.446 -8.374 -0.361 1.00 0.96 C
|
| 313 |
+
ATOM 291 CB GLN A 37 -2.749 -10.458 0.117 1.00 0.94 C
|
| 314 |
+
ATOM 292 O GLN A 37 -0.273 -8.312 0.013 1.00 0.96 O
|
| 315 |
+
ATOM 293 CG GLN A 37 -1.503 -11.329 0.189 1.00 0.79 C
|
| 316 |
+
ATOM 294 CD GLN A 37 -1.003 -11.522 1.609 1.00 0.72 C
|
| 317 |
+
ATOM 295 NE2 GLN A 37 0.254 -11.931 1.744 1.00 0.59 N
|
| 318 |
+
ATOM 296 OE1 GLN A 37 -1.740 -11.305 2.576 1.00 0.68 O
|
| 319 |
+
ATOM 297 N VAL A 38 -1.860 -7.916 -1.492 1.00 0.96 N
|
| 320 |
+
ATOM 298 CA VAL A 38 -0.934 -7.261 -2.410 1.00 0.96 C
|
| 321 |
+
ATOM 299 C VAL A 38 -0.384 -5.989 -1.769 1.00 0.96 C
|
| 322 |
+
ATOM 300 CB VAL A 38 -1.612 -6.928 -3.758 1.00 0.95 C
|
| 323 |
+
ATOM 301 O VAL A 38 0.810 -5.699 -1.875 1.00 0.96 O
|
| 324 |
+
ATOM 302 CG1 VAL A 38 -0.732 -5.996 -4.589 1.00 0.93 C
|
| 325 |
+
ATOM 303 CG2 VAL A 38 -1.919 -8.209 -4.531 1.00 0.92 C
|
| 326 |
+
ATOM 304 N ILE A 39 -1.219 -5.306 -1.044 1.00 0.96 N
|
| 327 |
+
ATOM 305 CA ILE A 39 -0.801 -4.074 -0.385 1.00 0.96 C
|
| 328 |
+
ATOM 306 C ILE A 39 0.250 -4.387 0.677 1.00 0.96 C
|
| 329 |
+
ATOM 307 CB ILE A 39 -2.002 -3.337 0.250 1.00 0.96 C
|
| 330 |
+
ATOM 308 O ILE A 39 1.263 -3.692 0.781 1.00 0.96 O
|
| 331 |
+
ATOM 309 CG1 ILE A 39 -2.970 -2.857 -0.838 1.00 0.94 C
|
| 332 |
+
ATOM 310 CG2 ILE A 39 -1.522 -2.166 1.112 1.00 0.94 C
|
| 333 |
+
ATOM 311 CD1 ILE A 39 -2.336 -1.930 -1.865 1.00 0.91 C
|
| 334 |
+
ATOM 312 N VAL A 40 0.033 -5.399 1.497 1.00 0.96 N
|
| 335 |
+
ATOM 313 CA VAL A 40 0.963 -5.793 2.550 1.00 0.96 C
|
| 336 |
+
ATOM 314 C VAL A 40 2.317 -6.146 1.940 1.00 0.97 C
|
| 337 |
+
ATOM 315 CB VAL A 40 0.420 -6.987 3.368 1.00 0.96 C
|
| 338 |
+
ATOM 316 O VAL A 40 3.362 -5.740 2.454 1.00 0.96 O
|
| 339 |
+
ATOM 317 CG1 VAL A 40 1.503 -7.549 4.287 1.00 0.89 C
|
| 340 |
+
ATOM 318 CG2 VAL A 40 -0.806 -6.565 4.177 1.00 0.89 C
|
| 341 |
+
ATOM 319 N GLN A 41 2.280 -6.826 0.823 1.00 0.96 N
|
| 342 |
+
ATOM 320 CA GLN A 41 3.510 -7.216 0.142 1.00 0.96 C
|
| 343 |
+
ATOM 321 C GLN A 41 4.236 -5.998 -0.424 1.00 0.96 C
|
| 344 |
+
ATOM 322 CB GLN A 41 3.211 -8.215 -0.977 1.00 0.95 C
|
| 345 |
+
ATOM 323 O GLN A 41 5.463 -5.909 -0.348 1.00 0.95 O
|
| 346 |
+
ATOM 324 CG GLN A 41 2.779 -9.586 -0.477 1.00 0.86 C
|
| 347 |
+
ATOM 325 CD GLN A 41 2.333 -10.506 -1.598 1.00 0.79 C
|
| 348 |
+
ATOM 326 NE2 GLN A 41 2.103 -11.773 -1.270 1.00 0.69 N
|
| 349 |
+
ATOM 327 OE1 GLN A 41 2.196 -10.082 -2.750 1.00 0.77 O
|
| 350 |
+
ATOM 328 N ASP A 42 3.536 -5.124 -0.987 1.00 0.96 N
|
| 351 |
+
ATOM 329 CA ASP A 42 4.119 -3.918 -1.568 1.00 0.96 C
|
| 352 |
+
ATOM 330 C ASP A 42 4.728 -3.027 -0.487 1.00 0.96 C
|
| 353 |
+
ATOM 331 CB ASP A 42 3.066 -3.140 -2.359 1.00 0.95 C
|
| 354 |
+
ATOM 332 O ASP A 42 5.806 -2.461 -0.677 1.00 0.95 O
|
| 355 |
+
ATOM 333 CG ASP A 42 2.757 -3.764 -3.709 1.00 0.94 C
|
| 356 |
+
ATOM 334 OD1 ASP A 42 3.553 -4.599 -4.191 1.00 0.91 O
|
| 357 |
+
ATOM 335 OD2 ASP A 42 1.711 -3.416 -4.297 1.00 0.92 O
|
| 358 |
+
ATOM 336 N ILE A 43 4.006 -2.921 0.639 1.00 0.96 N
|
| 359 |
+
ATOM 337 CA ILE A 43 4.512 -2.107 1.738 1.00 0.96 C
|
| 360 |
+
ATOM 338 C ILE A 43 5.806 -2.713 2.276 1.00 0.96 C
|
| 361 |
+
ATOM 339 CB ILE A 43 3.470 -1.976 2.871 1.00 0.96 C
|
| 362 |
+
ATOM 340 O ILE A 43 6.770 -1.994 2.548 1.00 0.96 O
|
| 363 |
+
ATOM 341 CG1 ILE A 43 2.293 -1.106 2.416 1.00 0.94 C
|
| 364 |
+
ATOM 342 CG2 ILE A 43 4.117 -1.404 4.136 1.00 0.93 C
|
| 365 |
+
ATOM 343 CD1 ILE A 43 2.657 0.352 2.174 1.00 0.91 C
|
| 366 |
+
ATOM 344 N ALA A 44 5.851 -4.052 2.411 1.00 0.96 N
|
| 367 |
+
ATOM 345 CA ALA A 44 7.075 -4.731 2.830 1.00 0.96 C
|
| 368 |
+
ATOM 346 C ALA A 44 8.218 -4.451 1.859 1.00 0.96 C
|
| 369 |
+
ATOM 347 CB ALA A 44 6.837 -6.234 2.947 1.00 0.96 C
|
| 370 |
+
ATOM 348 O ALA A 44 9.353 -4.213 2.278 1.00 0.96 O
|
| 371 |
+
ATOM 349 N TYR A 45 7.872 -4.425 0.639 1.00 0.95 N
|
| 372 |
+
ATOM 350 CA TYR A 45 8.877 -4.165 -0.387 1.00 0.95 C
|
| 373 |
+
ATOM 351 C TYR A 45 9.363 -2.723 -0.320 1.00 0.95 C
|
| 374 |
+
ATOM 352 CB TYR A 45 8.312 -4.461 -1.780 1.00 0.95 C
|
| 375 |
+
ATOM 353 O TYR A 45 10.568 -2.464 -0.380 1.00 0.95 O
|
| 376 |
+
ATOM 354 CG TYR A 45 9.316 -4.279 -2.893 1.00 0.93 C
|
| 377 |
+
ATOM 355 CD1 TYR A 45 10.439 -5.098 -2.985 1.00 0.89 C
|
| 378 |
+
ATOM 356 CD2 TYR A 45 9.143 -3.288 -3.853 1.00 0.88 C
|
| 379 |
+
ATOM 357 CE1 TYR A 45 11.367 -4.933 -4.008 1.00 0.88 C
|
| 380 |
+
ATOM 358 CE2 TYR A 45 10.065 -3.113 -4.880 1.00 0.88 C
|
| 381 |
+
ATOM 359 OH TYR A 45 12.088 -3.771 -5.963 1.00 0.79 O
|
| 382 |
+
ATOM 360 CZ TYR A 45 11.172 -3.940 -4.949 1.00 0.86 C
|
| 383 |
+
ATOM 361 N LEU A 46 8.521 -1.815 -0.237 1.00 0.95 N
|
| 384 |
+
ATOM 362 CA LEU A 46 8.894 -0.409 -0.123 1.00 0.95 C
|
| 385 |
+
ATOM 363 C LEU A 46 9.792 -0.181 1.088 1.00 0.95 C
|
| 386 |
+
ATOM 364 CB LEU A 46 7.645 0.470 -0.019 1.00 0.95 C
|
| 387 |
+
ATOM 365 O LEU A 46 10.747 0.595 1.021 1.00 0.95 O
|
| 388 |
+
ATOM 366 CG LEU A 46 6.847 0.669 -1.309 1.00 0.95 C
|
| 389 |
+
ATOM 367 CD1 LEU A 46 5.518 1.356 -1.010 1.00 0.93 C
|
| 390 |
+
ATOM 368 CD2 LEU A 46 7.656 1.476 -2.319 1.00 0.93 C
|
| 391 |
+
ATOM 369 N ARG A 47 9.531 -0.912 2.192 1.00 0.96 N
|
| 392 |
+
ATOM 370 CA ARG A 47 10.387 -0.815 3.370 1.00 0.96 C
|
| 393 |
+
ATOM 371 C ARG A 47 11.791 -1.332 3.072 1.00 0.96 C
|
| 394 |
+
ATOM 372 CB ARG A 47 9.781 -1.591 4.541 1.00 0.95 C
|
| 395 |
+
ATOM 373 O ARG A 47 12.781 -0.740 3.506 1.00 0.95 O
|
| 396 |
+
ATOM 374 CG ARG A 47 8.562 -0.923 5.158 1.00 0.93 C
|
| 397 |
+
ATOM 375 CD ARG A 47 8.006 -1.730 6.323 1.00 0.91 C
|
| 398 |
+
ATOM 376 NE ARG A 47 6.966 -0.996 7.039 1.00 0.90 N
|
| 399 |
+
ATOM 377 NH1 ARG A 47 5.819 -2.880 7.724 1.00 0.83 N
|
| 400 |
+
ATOM 378 NH2 ARG A 47 5.061 -0.797 8.306 1.00 0.81 N
|
| 401 |
+
ATOM 379 CZ ARG A 47 5.951 -1.559 7.688 1.00 0.87 C
|
| 402 |
+
ATOM 380 N SER A 48 11.822 -2.364 2.346 1.00 0.96 N
|
| 403 |
+
ATOM 381 CA SER A 48 13.124 -2.927 2.003 1.00 0.96 C
|
| 404 |
+
ATOM 382 C SER A 48 13.929 -1.969 1.131 1.00 0.95 C
|
| 405 |
+
ATOM 383 CB SER A 48 12.957 -4.266 1.283 1.00 0.95 C
|
| 406 |
+
ATOM 384 O SER A 48 15.159 -2.041 1.091 1.00 0.94 O
|
| 407 |
+
ATOM 385 OG SER A 48 12.598 -4.066 -0.073 1.00 0.90 O
|
| 408 |
+
ATOM 386 N LEU A 49 13.228 -1.054 0.452 1.00 0.95 N
|
| 409 |
+
ATOM 387 CA LEU A 49 13.904 -0.075 -0.393 1.00 0.94 C
|
| 410 |
+
ATOM 388 C LEU A 49 14.342 1.137 0.423 1.00 0.94 C
|
| 411 |
+
ATOM 389 CB LEU A 49 12.989 0.368 -1.537 1.00 0.94 C
|
| 412 |
+
ATOM 390 O LEU A 49 14.979 2.049 -0.107 1.00 0.92 O
|
| 413 |
+
ATOM 391 CG LEU A 49 12.691 -0.678 -2.612 1.00 0.89 C
|
| 414 |
+
ATOM 392 CD1 LEU A 49 11.737 -0.109 -3.656 1.00 0.83 C
|
| 415 |
+
ATOM 393 CD2 LEU A 49 13.984 -1.156 -3.265 1.00 0.83 C
|
| 416 |
+
ATOM 394 N GLY A 50 13.868 1.196 1.683 1.00 0.94 N
|
| 417 |
+
ATOM 395 CA GLY A 50 14.344 2.263 2.549 1.00 0.94 C
|
| 418 |
+
ATOM 396 C GLY A 50 13.258 3.253 2.927 1.00 0.94 C
|
| 419 |
+
ATOM 397 O GLY A 50 13.514 4.216 3.652 1.00 0.93 O
|
| 420 |
+
ATOM 398 N TYR A 51 12.070 3.128 2.433 1.00 0.94 N
|
| 421 |
+
ATOM 399 CA TYR A 51 10.982 4.007 2.846 1.00 0.94 C
|
| 422 |
+
ATOM 400 C TYR A 51 10.598 3.753 4.299 1.00 0.94 C
|
| 423 |
+
ATOM 401 CB TYR A 51 9.760 3.812 1.942 1.00 0.94 C
|
| 424 |
+
ATOM 402 O TYR A 51 10.466 2.601 4.721 1.00 0.93 O
|
| 425 |
+
ATOM 403 CG TYR A 51 9.971 4.290 0.526 1.00 0.93 C
|
| 426 |
+
ATOM 404 CD1 TYR A 51 9.767 5.625 0.183 1.00 0.92 C
|
| 427 |
+
ATOM 405 CD2 TYR A 51 10.373 3.408 -0.472 1.00 0.91 C
|
| 428 |
+
ATOM 406 CE1 TYR A 51 9.958 6.069 -1.121 1.00 0.91 C
|
| 429 |
+
ATOM 407 CE2 TYR A 51 10.567 3.842 -1.779 1.00 0.91 C
|
| 430 |
+
ATOM 408 OH TYR A 51 10.548 5.607 -3.386 1.00 0.87 O
|
| 431 |
+
ATOM 409 CZ TYR A 51 10.358 5.172 -2.093 1.00 0.91 C
|
| 432 |
+
ATOM 410 N ASN A 52 10.429 4.810 5.078 1.00 0.95 N
|
| 433 |
+
ATOM 411 CA ASN A 52 10.061 4.716 6.487 1.00 0.95 C
|
| 434 |
+
ATOM 412 C ASN A 52 8.547 4.669 6.670 1.00 0.95 C
|
| 435 |
+
ATOM 413 CB ASN A 52 10.656 5.885 7.276 1.00 0.94 C
|
| 436 |
+
ATOM 414 O ASN A 52 7.960 5.581 7.255 1.00 0.94 O
|
| 437 |
+
ATOM 415 CG ASN A 52 10.661 5.638 8.772 1.00 0.86 C
|
| 438 |
+
ATOM 416 ND2 ASN A 52 10.958 6.677 9.543 1.00 0.77 N
|
| 439 |
+
ATOM 417 OD1 ASN A 52 10.401 4.522 9.230 1.00 0.75 O
|
| 440 |
+
ATOM 418 N ILE A 53 7.934 3.537 6.187 1.00 0.95 N
|
| 441 |
+
ATOM 419 CA ILE A 53 6.498 3.323 6.325 1.00 0.96 C
|
| 442 |
+
ATOM 420 C ILE A 53 6.209 2.590 7.633 1.00 0.95 C
|
| 443 |
+
ATOM 421 CB ILE A 53 5.927 2.530 5.128 1.00 0.95 C
|
| 444 |
+
ATOM 422 O ILE A 53 6.695 1.477 7.849 1.00 0.94 O
|
| 445 |
+
ATOM 423 CG1 ILE A 53 6.259 3.239 3.810 1.00 0.94 C
|
| 446 |
+
ATOM 424 CG2 ILE A 53 4.416 2.335 5.281 1.00 0.94 C
|
| 447 |
+
ATOM 425 CD1 ILE A 53 5.911 2.431 2.568 1.00 0.93 C
|
| 448 |
+
ATOM 426 N VAL A 54 5.441 3.118 8.475 1.00 0.95 N
|
| 449 |
+
ATOM 427 CA VAL A 54 5.115 2.530 9.770 1.00 0.95 C
|
| 450 |
+
ATOM 428 C VAL A 54 3.650 2.103 9.791 1.00 0.95 C
|
| 451 |
+
ATOM 429 CB VAL A 54 5.400 3.514 10.927 1.00 0.94 C
|
| 452 |
+
ATOM 430 O VAL A 54 2.779 2.826 9.299 1.00 0.95 O
|
| 453 |
+
ATOM 431 CG1 VAL A 54 4.993 2.903 12.267 1.00 0.78 C
|
| 454 |
+
ATOM 432 CG2 VAL A 54 6.876 3.906 10.942 1.00 0.78 C
|
| 455 |
+
ATOM 433 N ALA A 55 3.436 0.986 10.239 1.00 0.95 N
|
| 456 |
+
ATOM 434 CA ALA A 55 2.086 0.462 10.428 1.00 0.95 C
|
| 457 |
+
ATOM 435 C ALA A 55 1.527 0.864 11.790 1.00 0.94 C
|
| 458 |
+
ATOM 436 CB ALA A 55 2.080 -1.057 10.281 1.00 0.94 C
|
| 459 |
+
ATOM 437 O ALA A 55 2.203 0.723 12.811 1.00 0.93 O
|
| 460 |
+
ATOM 438 N THR A 56 0.347 1.432 11.827 1.00 0.93 N
|
| 461 |
+
ATOM 439 CA THR A 56 -0.403 1.797 13.024 1.00 0.93 C
|
| 462 |
+
ATOM 440 C THR A 56 -1.730 1.046 13.080 1.00 0.93 C
|
| 463 |
+
ATOM 441 CB THR A 56 -0.665 3.314 13.077 1.00 0.92 C
|
| 464 |
+
ATOM 442 O THR A 56 -2.129 0.405 12.106 1.00 0.92 O
|
| 465 |
+
ATOM 443 CG2 THR A 56 0.585 4.102 12.702 1.00 0.87 C
|
| 466 |
+
ATOM 444 OG1 THR A 56 -1.717 3.643 12.161 1.00 0.88 O
|
| 467 |
+
ATOM 445 N PRO A 57 -2.436 0.978 14.221 1.00 0.92 N
|
| 468 |
+
ATOM 446 CA PRO A 57 -3.759 0.351 14.257 1.00 0.92 C
|
| 469 |
+
ATOM 447 C PRO A 57 -4.713 0.927 13.214 1.00 0.91 C
|
| 470 |
+
ATOM 448 CB PRO A 57 -4.253 0.651 15.675 1.00 0.91 C
|
| 471 |
+
ATOM 449 O PRO A 57 -5.663 0.255 12.803 1.00 0.89 O
|
| 472 |
+
ATOM 450 CG PRO A 57 -3.010 0.760 16.497 1.00 0.89 C
|
| 473 |
+
ATOM 451 CD PRO A 57 -1.939 1.398 15.660 1.00 0.88 C
|
| 474 |
+
ATOM 452 N ARG A 58 -4.402 2.198 12.767 1.00 0.94 N
|
| 475 |
+
ATOM 453 CA ARG A 58 -5.325 2.830 11.831 1.00 0.94 C
|
| 476 |
+
ATOM 454 C ARG A 58 -4.823 2.702 10.396 1.00 0.93 C
|
| 477 |
+
ATOM 455 CB ARG A 58 -5.525 4.305 12.186 1.00 0.92 C
|
| 478 |
+
ATOM 456 O ARG A 58 -5.517 3.089 9.454 1.00 0.92 O
|
| 479 |
+
ATOM 457 CG ARG A 58 -6.256 4.529 13.500 1.00 0.86 C
|
| 480 |
+
ATOM 458 CD ARG A 58 -6.551 6.003 13.738 1.00 0.82 C
|
| 481 |
+
ATOM 459 NE ARG A 58 -5.333 6.759 14.014 1.00 0.77 N
|
| 482 |
+
ATOM 460 NH1 ARG A 58 -6.405 8.786 14.289 1.00 0.64 N
|
| 483 |
+
ATOM 461 NH2 ARG A 58 -4.128 8.650 14.509 1.00 0.59 N
|
| 484 |
+
ATOM 462 CZ ARG A 58 -5.291 8.063 14.270 1.00 0.75 C
|
| 485 |
+
ATOM 463 N GLY A 59 -3.702 2.248 10.206 1.00 0.95 N
|
| 486 |
+
ATOM 464 CA GLY A 59 -3.182 2.097 8.857 1.00 0.95 C
|
| 487 |
+
ATOM 465 C GLY A 59 -1.700 2.406 8.749 1.00 0.95 C
|
| 488 |
+
ATOM 466 O GLY A 59 -0.945 2.190 9.699 1.00 0.95 O
|
| 489 |
+
ATOM 467 N TYR A 60 -1.248 2.930 7.637 1.00 0.96 N
|
| 490 |
+
ATOM 468 CA TYR A 60 0.157 3.167 7.325 1.00 0.96 C
|
| 491 |
+
ATOM 469 C TYR A 60 0.457 4.660 7.265 1.00 0.96 C
|
| 492 |
+
ATOM 470 CB TYR A 60 0.531 2.505 5.996 1.00 0.96 C
|
| 493 |
+
ATOM 471 O TYR A 60 -0.380 5.451 6.824 1.00 0.95 O
|
| 494 |
+
ATOM 472 CG TYR A 60 0.342 1.008 5.988 1.00 0.95 C
|
| 495 |
+
ATOM 473 CD1 TYR A 60 1.278 0.165 6.583 1.00 0.94 C
|
| 496 |
+
ATOM 474 CD2 TYR A 60 -0.772 0.433 5.386 1.00 0.94 C
|
| 497 |
+
ATOM 475 CE1 TYR A 60 1.109 -1.216 6.577 1.00 0.94 C
|
| 498 |
+
ATOM 476 CE2 TYR A 60 -0.952 -0.946 5.374 1.00 0.94 C
|
| 499 |
+
ATOM 477 OH TYR A 60 -0.180 -3.127 5.962 1.00 0.90 O
|
| 500 |
+
ATOM 478 CZ TYR A 60 -0.008 -1.761 5.971 1.00 0.93 C
|
| 501 |
+
ATOM 479 N VAL A 61 1.559 5.020 7.726 1.00 0.95 N
|
| 502 |
+
ATOM 480 CA VAL A 61 2.043 6.393 7.628 1.00 0.95 C
|
| 503 |
+
ATOM 481 C VAL A 61 3.484 6.399 7.124 1.00 0.95 C
|
| 504 |
+
ATOM 482 CB VAL A 61 1.950 7.126 8.985 1.00 0.93 C
|
| 505 |
+
ATOM 483 O VAL A 61 4.263 5.496 7.439 1.00 0.94 O
|
| 506 |
+
ATOM 484 CG1 VAL A 61 0.504 7.172 9.476 1.00 0.75 C
|
| 507 |
+
ATOM 485 CG2 VAL A 61 2.847 6.448 10.019 1.00 0.76 C
|
| 508 |
+
ATOM 486 N LEU A 62 3.751 7.326 6.213 1.00 0.95 N
|
| 509 |
+
ATOM 487 CA LEU A 62 5.136 7.577 5.829 1.00 0.95 C
|
| 510 |
+
ATOM 488 C LEU A 62 5.806 8.537 6.807 1.00 0.94 C
|
| 511 |
+
ATOM 489 CB LEU A 62 5.205 8.146 4.410 1.00 0.94 C
|
| 512 |
+
ATOM 490 O LEU A 62 5.536 9.740 6.785 1.00 0.92 O
|
| 513 |
+
ATOM 491 CG LEU A 62 6.604 8.327 3.819 1.00 0.89 C
|
| 514 |
+
ATOM 492 CD1 LEU A 62 7.315 6.982 3.717 1.00 0.83 C
|
| 515 |
+
ATOM 493 CD2 LEU A 62 6.525 9.001 2.453 1.00 0.82 C
|
| 516 |
+
ATOM 494 N ALA A 63 6.595 7.972 7.715 1.00 0.91 N
|
| 517 |
+
ATOM 495 CA ALA A 63 7.245 8.760 8.759 1.00 0.91 C
|
| 518 |
+
ATOM 496 C ALA A 63 8.462 9.500 8.211 1.00 0.88 C
|
| 519 |
+
ATOM 497 CB ALA A 63 7.653 7.864 9.926 1.00 0.87 C
|
| 520 |
+
ATOM 498 O ALA A 63 9.150 9.000 7.317 1.00 0.84 O
|
| 521 |
+
ATOM 499 N GLY A 64 8.844 10.801 8.768 1.00 0.79 N
|
| 522 |
+
ATOM 500 CA GLY A 64 10.029 11.551 8.385 1.00 0.77 C
|
| 523 |
+
ATOM 501 C GLY A 64 9.839 12.359 7.115 1.00 0.76 C
|
| 524 |
+
ATOM 502 O GLY A 64 10.811 12.831 6.522 1.00 0.70 O
|
| 525 |
+
ATOM 503 N GLY A 65 8.383 12.399 6.587 1.00 0.54 N
|
| 526 |
+
ATOM 504 CA GLY A 65 8.133 13.333 5.501 1.00 0.54 C
|
| 527 |
+
ATOM 505 C GLY A 65 7.712 14.709 5.983 1.00 0.53 C
|
| 528 |
+
ATOM 506 O GLY A 65 7.142 14.845 7.067 1.00 0.51 O
|
esm/mcp_output/predictions/prediction_20250830_220641.pdb
ADDED
|
@@ -0,0 +1,489 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
HEADER 18-OCT-22
|
| 2 |
+
TITLE ESMFOLD V1 PREDICTION FOR INPUT
|
| 3 |
+
REMARK 1
|
| 4 |
+
REMARK 1 REFERENCE 1
|
| 5 |
+
REMARK 1 AUTH ZEMING LIN, HALIL AKIN, ROSHAN RAO, BRIAN HIE, ZHONGKAI ZHU,
|
| 6 |
+
REMARK 1 AUTH 2 WENTING LU, NIKITA SMETANIN, ROBERT VERKUIL, ORI KABELI,
|
| 7 |
+
REMARK 1 AUTH 3 YANIV SHMUELI, ALLAN DOS SANTOS COSTA,
|
| 8 |
+
REMARK 1 AUTH 4 MARYAM FAZEL-ZARANDI, TOM SERCU, SALVATORE CANDIDO,
|
| 9 |
+
REMARK 1 AUTH 5 ALEXANDER RIVES
|
| 10 |
+
REMARK 1 TITL EVOLUTIONARY-SCALE PREDICTION OF ATOMIC LEVEL PROTEIN
|
| 11 |
+
REMARK 1 TITL 2 STRUCTURE WITH A LANGUAGE MODEL
|
| 12 |
+
REMARK 1 REF
|
| 13 |
+
REMARK 1 REFN
|
| 14 |
+
REMARK 1 PMID
|
| 15 |
+
REMARK 1 DOI 10.1101/2022.07.20.500902
|
| 16 |
+
REMARK 1
|
| 17 |
+
REMARK 1 LICENSE AND DISCLAIMERS
|
| 18 |
+
REMARK 1 ESM METAGENOMIC ATLAS DATA IS AVAILABLE UNDER
|
| 19 |
+
REMARK 1 A CC-BY-4.0 LICENSE FOR ACADEMIC AND COMMERCIAL USE.
|
| 20 |
+
REMARK 1 COPYRIGHT (C) META PLATFORMS, INC. ALL RIGHTS RESERVED.
|
| 21 |
+
REMARK 1 USE OF THE ESM METAGENOMIC ATLAS DATA IS SUBJECT
|
| 22 |
+
REMARK 1 TO THE META OPEN SOURCE TERMS OF USE AND PRIVACY POLICY.
|
| 23 |
+
ATOM 1 N MET A 1 12.955 22.762 2.808 1.00 0.40 N
|
| 24 |
+
ATOM 2 CA MET A 1 13.442 21.402 3.023 1.00 0.43 C
|
| 25 |
+
ATOM 3 C MET A 1 12.281 20.416 3.108 1.00 0.41 C
|
| 26 |
+
ATOM 4 CB MET A 1 14.285 21.328 4.297 1.00 0.37 C
|
| 27 |
+
ATOM 5 O MET A 1 11.322 20.643 3.847 1.00 0.40 O
|
| 28 |
+
ATOM 6 CG MET A 1 15.524 20.457 4.162 1.00 0.36 C
|
| 29 |
+
ATOM 7 SD MET A 1 16.674 20.646 5.579 1.00 0.46 S
|
| 30 |
+
ATOM 8 CE MET A 1 16.455 19.037 6.387 1.00 0.35 C
|
| 31 |
+
ATOM 9 N LYS A 2 11.743 19.862 2.050 1.00 0.44 N
|
| 32 |
+
ATOM 10 CA LYS A 2 10.680 18.862 2.091 1.00 0.48 C
|
| 33 |
+
ATOM 11 C LYS A 2 10.854 17.924 3.282 1.00 0.45 C
|
| 34 |
+
ATOM 12 CB LYS A 2 10.648 18.058 0.791 1.00 0.41 C
|
| 35 |
+
ATOM 13 O LYS A 2 11.967 17.481 3.573 1.00 0.44 O
|
| 36 |
+
ATOM 14 CG LYS A 2 9.807 18.691 -0.308 1.00 0.40 C
|
| 37 |
+
ATOM 15 CD LYS A 2 9.743 17.804 -1.544 1.00 0.44 C
|
| 38 |
+
ATOM 16 CE LYS A 2 8.931 18.452 -2.657 1.00 0.42 C
|
| 39 |
+
ATOM 17 NZ LYS A 2 8.883 17.595 -3.880 1.00 0.37 N
|
| 40 |
+
ATOM 18 N THR A 3 10.260 18.153 4.498 1.00 0.59 N
|
| 41 |
+
ATOM 19 CA THR A 3 10.394 17.359 5.714 1.00 0.62 C
|
| 42 |
+
ATOM 20 C THR A 3 10.444 15.870 5.386 1.00 0.57 C
|
| 43 |
+
ATOM 21 CB THR A 3 9.235 17.633 6.691 1.00 0.52 C
|
| 44 |
+
ATOM 22 O THR A 3 10.034 15.454 4.300 1.00 0.52 O
|
| 45 |
+
ATOM 23 CG2 THR A 3 9.446 18.945 7.440 1.00 0.42 C
|
| 46 |
+
ATOM 24 OG1 THR A 3 8.007 17.707 5.957 1.00 0.44 O
|
| 47 |
+
ATOM 25 N VAL A 4 11.392 14.978 5.807 1.00 0.55 N
|
| 48 |
+
ATOM 26 CA VAL A 4 11.478 13.527 5.688 1.00 0.55 C
|
| 49 |
+
ATOM 27 C VAL A 4 10.083 12.942 5.476 1.00 0.55 C
|
| 50 |
+
ATOM 28 CB VAL A 4 12.139 12.893 6.932 1.00 0.50 C
|
| 51 |
+
ATOM 29 O VAL A 4 9.905 12.019 4.678 1.00 0.54 O
|
| 52 |
+
ATOM 30 CG1 VAL A 4 12.097 11.368 6.849 1.00 0.40 C
|
| 53 |
+
ATOM 31 CG2 VAL A 4 13.578 13.384 7.080 1.00 0.41 C
|
| 54 |
+
ATOM 32 N ARG A 5 9.045 13.367 6.280 1.00 0.59 N
|
| 55 |
+
ATOM 33 CA ARG A 5 7.679 12.859 6.208 1.00 0.59 C
|
| 56 |
+
ATOM 34 C ARG A 5 7.124 12.979 4.793 1.00 0.58 C
|
| 57 |
+
ATOM 35 CB ARG A 5 6.775 13.607 7.191 1.00 0.55 C
|
| 58 |
+
ATOM 36 O ARG A 5 6.507 12.043 4.280 1.00 0.57 O
|
| 59 |
+
ATOM 37 CG ARG A 5 5.520 12.841 7.578 1.00 0.51 C
|
| 60 |
+
ATOM 38 CD ARG A 5 4.719 13.573 8.645 1.00 0.53 C
|
| 61 |
+
ATOM 39 NE ARG A 5 3.461 12.892 8.938 1.00 0.46 N
|
| 62 |
+
ATOM 40 NH1 ARG A 5 2.718 14.472 10.450 1.00 0.36 N
|
| 63 |
+
ATOM 41 NH2 ARG A 5 1.430 12.635 9.979 1.00 0.32 N
|
| 64 |
+
ATOM 42 CZ ARG A 5 2.539 13.335 9.788 1.00 0.47 C
|
| 65 |
+
ATOM 43 N GLN A 6 7.225 14.159 4.296 1.00 0.54 N
|
| 66 |
+
ATOM 44 CA GLN A 6 6.739 14.443 2.950 1.00 0.53 C
|
| 67 |
+
ATOM 45 C GLN A 6 7.437 13.564 1.916 1.00 0.54 C
|
| 68 |
+
ATOM 46 CB GLN A 6 6.942 15.920 2.607 1.00 0.49 C
|
| 69 |
+
ATOM 47 O GLN A 6 6.804 13.084 0.974 1.00 0.53 O
|
| 70 |
+
ATOM 48 CG GLN A 6 5.903 16.844 3.227 1.00 0.46 C
|
| 71 |
+
ATOM 49 CD GLN A 6 6.171 18.309 2.935 1.00 0.48 C
|
| 72 |
+
ATOM 50 NE2 GLN A 6 5.132 19.132 3.031 1.00 0.42 N
|
| 73 |
+
ATOM 51 OE1 GLN A 6 7.302 18.697 2.626 1.00 0.54 O
|
| 74 |
+
ATOM 52 N GLU A 7 8.751 13.356 2.208 1.00 0.53 N
|
| 75 |
+
ATOM 53 CA GLU A 7 9.522 12.566 1.253 1.00 0.51 C
|
| 76 |
+
ATOM 54 C GLU A 7 9.048 11.115 1.229 1.00 0.52 C
|
| 77 |
+
ATOM 55 CB GLU A 7 11.015 12.627 1.585 1.00 0.49 C
|
| 78 |
+
ATOM 56 O GLU A 7 8.959 10.504 0.162 1.00 0.52 O
|
| 79 |
+
ATOM 57 CG GLU A 7 11.700 13.898 1.106 1.00 0.45 C
|
| 80 |
+
ATOM 58 CD GLU A 7 13.193 13.920 1.394 1.00 0.47 C
|
| 81 |
+
ATOM 59 OE1 GLU A 7 13.697 12.984 2.054 1.00 0.53 O
|
| 82 |
+
ATOM 60 OE2 GLU A 7 13.863 14.882 0.956 1.00 0.48 O
|
| 83 |
+
ATOM 61 N ARG A 8 8.906 10.459 2.342 1.00 0.54 N
|
| 84 |
+
ATOM 62 CA ARG A 8 8.501 9.057 2.381 1.00 0.53 C
|
| 85 |
+
ATOM 63 C ARG A 8 7.127 8.866 1.748 1.00 0.54 C
|
| 86 |
+
ATOM 64 CB ARG A 8 8.491 8.540 3.821 1.00 0.52 C
|
| 87 |
+
ATOM 65 O ARG A 8 6.894 7.883 1.042 1.00 0.53 O
|
| 88 |
+
ATOM 66 CG ARG A 8 9.859 8.124 4.336 1.00 0.50 C
|
| 89 |
+
ATOM 67 CD ARG A 8 9.777 7.520 5.731 1.00 0.51 C
|
| 90 |
+
ATOM 68 NE ARG A 8 11.087 7.083 6.207 1.00 0.45 N
|
| 91 |
+
ATOM 69 NH1 ARG A 8 10.307 6.243 8.213 1.00 0.37 N
|
| 92 |
+
ATOM 70 NH2 ARG A 8 12.540 6.130 7.708 1.00 0.33 N
|
| 93 |
+
ATOM 71 CZ ARG A 8 11.308 6.486 7.375 1.00 0.48 C
|
| 94 |
+
ATOM 72 N LEU A 9 6.231 9.830 2.211 1.00 0.55 N
|
| 95 |
+
ATOM 73 CA LEU A 9 4.899 9.731 1.624 1.00 0.54 C
|
| 96 |
+
ATOM 74 C LEU A 9 4.969 9.810 0.103 1.00 0.55 C
|
| 97 |
+
ATOM 75 CB LEU A 9 3.992 10.840 2.163 1.00 0.52 C
|
| 98 |
+
ATOM 76 O LEU A 9 4.234 9.107 -0.594 1.00 0.55 O
|
| 99 |
+
ATOM 77 CG LEU A 9 3.348 10.585 3.527 1.00 0.50 C
|
| 100 |
+
ATOM 78 CD1 LEU A 9 2.845 11.893 4.128 1.00 0.45 C
|
| 101 |
+
ATOM 79 CD2 LEU A 9 2.211 9.577 3.402 1.00 0.47 C
|
| 102 |
+
ATOM 80 N LEU A 10 5.770 10.725 -0.331 1.00 0.49 N
|
| 103 |
+
ATOM 81 CA LEU A 10 5.935 10.895 -1.770 1.00 0.48 C
|
| 104 |
+
ATOM 82 C LEU A 10 6.431 9.605 -2.417 1.00 0.49 C
|
| 105 |
+
ATOM 83 CB LEU A 10 6.911 12.036 -2.067 1.00 0.46 C
|
| 106 |
+
ATOM 84 O LEU A 10 5.980 9.237 -3.503 1.00 0.49 O
|
| 107 |
+
ATOM 85 CG LEU A 10 6.296 13.336 -2.587 1.00 0.43 C
|
| 108 |
+
ATOM 86 CD1 LEU A 10 6.864 14.532 -1.829 1.00 0.38 C
|
| 109 |
+
ATOM 87 CD2 LEU A 10 6.540 13.481 -4.085 1.00 0.41 C
|
| 110 |
+
ATOM 88 N LYS A 11 7.442 8.965 -1.722 1.00 0.51 N
|
| 111 |
+
ATOM 89 CA LYS A 11 8.010 7.762 -2.324 1.00 0.50 C
|
| 112 |
+
ATOM 90 C LYS A 11 6.960 6.663 -2.454 1.00 0.51 C
|
| 113 |
+
ATOM 91 CB LYS A 11 9.197 7.258 -1.501 1.00 0.48 C
|
| 114 |
+
ATOM 92 O LYS A 11 6.940 5.930 -3.445 1.00 0.51 O
|
| 115 |
+
ATOM 93 CG LYS A 11 10.514 7.947 -1.828 1.00 0.46 C
|
| 116 |
+
ATOM 94 CD LYS A 11 11.665 7.365 -1.017 1.00 0.49 C
|
| 117 |
+
ATOM 95 CE LYS A 11 12.973 8.092 -1.300 1.00 0.43 C
|
| 118 |
+
ATOM 96 NZ LYS A 11 14.104 7.528 -0.504 1.00 0.40 N
|
| 119 |
+
ATOM 97 N ILE A 12 6.178 6.565 -1.403 1.00 0.53 N
|
| 120 |
+
ATOM 98 CA ILE A 12 5.147 5.534 -1.450 1.00 0.52 C
|
| 121 |
+
ATOM 99 C ILE A 12 4.183 5.820 -2.599 1.00 0.53 C
|
| 122 |
+
ATOM 100 CB ILE A 12 4.377 5.443 -0.114 1.00 0.51 C
|
| 123 |
+
ATOM 101 O ILE A 12 3.739 4.899 -3.289 1.00 0.54 O
|
| 124 |
+
ATOM 102 CG1 ILE A 12 5.283 4.883 0.988 1.00 0.45 C
|
| 125 |
+
ATOM 103 CG2 ILE A 12 3.117 4.587 -0.274 1.00 0.46 C
|
| 126 |
+
ATOM 104 CD1 ILE A 12 4.733 5.067 2.396 1.00 0.45 C
|
| 127 |
+
ATOM 105 N SER A 13 3.703 7.056 -2.575 1.00 0.53 N
|
| 128 |
+
ATOM 106 CA SER A 13 2.823 7.481 -3.659 1.00 0.52 C
|
| 129 |
+
ATOM 107 C SER A 13 3.421 7.144 -5.020 1.00 0.53 C
|
| 130 |
+
ATOM 108 CB SER A 13 2.549 8.982 -3.570 1.00 0.50 C
|
| 131 |
+
ATOM 109 O SER A 13 2.699 6.764 -5.945 1.00 0.53 O
|
| 132 |
+
ATOM 110 OG SER A 13 1.700 9.275 -2.474 1.00 0.49 O
|
| 133 |
+
ATOM 111 N LEU A 14 4.831 7.388 -5.129 1.00 0.48 N
|
| 134 |
+
ATOM 112 CA LEU A 14 5.477 7.149 -6.415 1.00 0.47 C
|
| 135 |
+
ATOM 113 C LEU A 14 5.426 5.669 -6.781 1.00 0.48 C
|
| 136 |
+
ATOM 114 CB LEU A 14 6.930 7.629 -6.382 1.00 0.44 C
|
| 137 |
+
ATOM 115 O LEU A 14 5.233 5.321 -7.948 1.00 0.47 O
|
| 138 |
+
ATOM 116 CG LEU A 14 7.205 9.002 -6.998 1.00 0.42 C
|
| 139 |
+
ATOM 117 CD1 LEU A 14 8.185 9.785 -6.130 1.00 0.36 C
|
| 140 |
+
ATOM 118 CD2 LEU A 14 7.740 8.854 -8.418 1.00 0.40 C
|
| 141 |
+
ATOM 119 N VAL A 15 5.762 4.833 -5.746 1.00 0.52 N
|
| 142 |
+
ATOM 120 CA VAL A 15 5.724 3.409 -6.064 1.00 0.51 C
|
| 143 |
+
ATOM 121 C VAL A 15 4.335 3.031 -6.573 1.00 0.52 C
|
| 144 |
+
ATOM 122 CB VAL A 15 6.097 2.543 -4.840 1.00 0.49 C
|
| 145 |
+
ATOM 123 O VAL A 15 4.204 2.238 -7.509 1.00 0.51 O
|
| 146 |
+
ATOM 124 CG1 VAL A 15 5.908 1.060 -5.150 1.00 0.44 C
|
| 147 |
+
ATOM 125 CG2 VAL A 15 7.536 2.823 -4.408 1.00 0.46 C
|
| 148 |
+
ATOM 126 N LEU A 16 3.392 3.725 -5.916 1.00 0.55 N
|
| 149 |
+
ATOM 127 CA LEU A 16 2.029 3.405 -6.323 1.00 0.54 C
|
| 150 |
+
ATOM 128 C LEU A 16 1.741 3.935 -7.724 1.00 0.55 C
|
| 151 |
+
ATOM 129 CB LEU A 16 1.021 3.986 -5.328 1.00 0.52 C
|
| 152 |
+
ATOM 130 O LEU A 16 0.957 3.342 -8.468 1.00 0.55 O
|
| 153 |
+
ATOM 131 CG LEU A 16 0.971 3.325 -3.950 1.00 0.50 C
|
| 154 |
+
ATOM 132 CD1 LEU A 16 0.096 4.140 -3.003 1.00 0.46 C
|
| 155 |
+
ATOM 133 CD2 LEU A 16 0.458 1.893 -4.061 1.00 0.47 C
|
| 156 |
+
ATOM 134 N SER A 17 2.343 5.204 -7.894 1.00 0.49 N
|
| 157 |
+
ATOM 135 CA SER A 17 2.073 5.808 -9.195 1.00 0.48 C
|
| 158 |
+
ATOM 136 C SER A 17 2.809 5.072 -10.309 1.00 0.49 C
|
| 159 |
+
ATOM 137 CB SER A 17 2.475 7.284 -9.195 1.00 0.45 C
|
| 160 |
+
ATOM 138 O SER A 17 2.409 5.139 -11.474 1.00 0.48 O
|
| 161 |
+
ATOM 139 OG SER A 17 3.880 7.422 -9.077 1.00 0.43 O
|
| 162 |
+
ATOM 140 N GLU A 18 4.061 4.645 -9.888 1.00 0.51 N
|
| 163 |
+
ATOM 141 CA GLU A 18 4.832 3.986 -10.938 1.00 0.50 C
|
| 164 |
+
ATOM 142 C GLU A 18 4.292 2.588 -11.224 1.00 0.51 C
|
| 165 |
+
ATOM 143 CB GLU A 18 6.311 3.910 -10.552 1.00 0.47 C
|
| 166 |
+
ATOM 144 O GLU A 18 4.769 1.907 -12.134 1.00 0.50 O
|
| 167 |
+
ATOM 145 CG GLU A 18 6.999 5.266 -10.482 1.00 0.45 C
|
| 168 |
+
ATOM 146 CD GLU A 18 8.505 5.185 -10.674 1.00 0.47 C
|
| 169 |
+
ATOM 147 OE1 GLU A 18 9.058 4.062 -10.671 1.00 0.48 O
|
| 170 |
+
ATOM 148 OE2 GLU A 18 9.138 6.254 -10.827 1.00 0.42 O
|
| 171 |
+
ATOM 149 N LEU A 19 3.449 2.137 -10.297 1.00 0.52 N
|
| 172 |
+
ATOM 150 CA LEU A 19 2.839 0.883 -10.722 1.00 0.52 C
|
| 173 |
+
ATOM 151 C LEU A 19 2.051 1.073 -12.014 1.00 0.52 C
|
| 174 |
+
ATOM 152 CB LEU A 19 1.921 0.334 -9.627 1.00 0.50 C
|
| 175 |
+
ATOM 153 O LEU A 19 1.400 2.103 -12.204 1.00 0.52 O
|
| 176 |
+
ATOM 154 CG LEU A 19 2.610 -0.333 -8.436 1.00 0.49 C
|
| 177 |
+
ATOM 155 CD1 LEU A 19 1.682 -0.348 -7.226 1.00 0.47 C
|
| 178 |
+
ATOM 156 CD2 LEU A 19 3.050 -1.748 -8.796 1.00 0.48 C
|
| 179 |
+
ATOM 157 N PRO A 20 2.509 0.425 -12.947 1.00 0.52 N
|
| 180 |
+
ATOM 158 CA PRO A 20 1.748 0.593 -14.187 1.00 0.50 C
|
| 181 |
+
ATOM 159 C PRO A 20 0.239 0.494 -13.972 1.00 0.52 C
|
| 182 |
+
ATOM 160 CB PRO A 20 2.250 -0.555 -15.067 1.00 0.49 C
|
| 183 |
+
ATOM 161 O PRO A 20 -0.239 -0.467 -13.363 1.00 0.51 O
|
| 184 |
+
ATOM 162 CG PRO A 20 3.024 -1.436 -14.141 1.00 0.46 C
|
| 185 |
+
ATOM 163 CD PRO A 20 3.173 -0.730 -12.824 1.00 0.46 C
|
| 186 |
+
ATOM 164 N LEU A 21 -0.387 1.673 -13.561 1.00 0.49 N
|
| 187 |
+
ATOM 165 CA LEU A 21 -1.840 1.543 -13.546 1.00 0.49 C
|
| 188 |
+
ATOM 166 C LEU A 21 -2.334 0.805 -14.786 1.00 0.49 C
|
| 189 |
+
ATOM 167 CB LEU A 21 -2.501 2.921 -13.461 1.00 0.47 C
|
| 190 |
+
ATOM 168 O LEU A 21 -3.422 0.225 -14.776 1.00 0.48 O
|
| 191 |
+
ATOM 169 CG LEU A 21 -2.547 3.567 -12.075 1.00 0.45 C
|
| 192 |
+
ATOM 170 CD1 LEU A 21 -2.511 5.087 -12.197 1.00 0.41 C
|
| 193 |
+
ATOM 171 CD2 LEU A 21 -3.791 3.116 -11.316 1.00 0.42 C
|
| 194 |
+
ATOM 172 N GLU A 22 -1.284 0.823 -15.746 1.00 0.49 N
|
| 195 |
+
ATOM 173 CA GLU A 22 -1.690 0.325 -17.056 1.00 0.49 C
|
| 196 |
+
ATOM 174 C GLU A 22 -1.183 -1.095 -17.289 1.00 0.49 C
|
| 197 |
+
ATOM 175 CB GLU A 22 -1.184 1.252 -18.164 1.00 0.45 C
|
| 198 |
+
ATOM 176 O GLU A 22 -1.105 -1.553 -18.431 1.00 0.47 O
|
| 199 |
+
ATOM 177 CG GLU A 22 -1.870 2.611 -18.191 1.00 0.43 C
|
| 200 |
+
ATOM 178 CD GLU A 22 -1.638 3.375 -19.484 1.00 0.46 C
|
| 201 |
+
ATOM 179 OE1 GLU A 22 -0.921 2.862 -20.372 1.00 0.44 O
|
| 202 |
+
ATOM 180 OE2 GLU A 22 -2.176 4.498 -19.610 1.00 0.40 O
|
| 203 |
+
ATOM 181 N SER A 23 -0.507 -1.685 -16.243 1.00 0.48 N
|
| 204 |
+
ATOM 182 CA SER A 23 -0.409 -3.087 -16.636 1.00 0.47 C
|
| 205 |
+
ATOM 183 C SER A 23 -1.772 -3.771 -16.589 1.00 0.48 C
|
| 206 |
+
ATOM 184 CB SER A 23 0.575 -3.830 -15.732 1.00 0.45 C
|
| 207 |
+
ATOM 185 O SER A 23 -2.460 -3.724 -15.568 1.00 0.46 O
|
| 208 |
+
ATOM 186 OG SER A 23 0.373 -3.480 -14.374 1.00 0.42 O
|
| 209 |
+
ATOM 187 N LYS A 24 -2.783 -3.245 -17.333 1.00 0.48 N
|
| 210 |
+
ATOM 188 CA LYS A 24 -3.816 -4.243 -17.597 1.00 0.47 C
|
| 211 |
+
ATOM 189 C LYS A 24 -3.258 -5.658 -17.476 1.00 0.48 C
|
| 212 |
+
ATOM 190 CB LYS A 24 -4.420 -4.035 -18.987 1.00 0.46 C
|
| 213 |
+
ATOM 191 O LYS A 24 -2.311 -6.020 -18.177 1.00 0.46 O
|
| 214 |
+
ATOM 192 CG LYS A 24 -5.249 -2.766 -19.119 1.00 0.44 C
|
| 215 |
+
ATOM 193 CD LYS A 24 -6.028 -2.741 -20.427 1.00 0.46 C
|
| 216 |
+
ATOM 194 CE LYS A 24 -6.820 -1.449 -20.582 1.00 0.40 C
|
| 217 |
+
ATOM 195 NZ LYS A 24 -7.573 -1.413 -21.871 1.00 0.37 N
|
| 218 |
+
ATOM 196 N PRO A 25 -3.101 -6.132 -16.224 1.00 0.48 N
|
| 219 |
+
ATOM 197 CA PRO A 25 -2.777 -7.561 -16.224 1.00 0.47 C
|
| 220 |
+
ATOM 198 C PRO A 25 -3.419 -8.311 -17.389 1.00 0.49 C
|
| 221 |
+
ATOM 199 CB PRO A 25 -3.336 -8.049 -14.886 1.00 0.46 C
|
| 222 |
+
ATOM 200 O PRO A 25 -4.457 -7.887 -17.905 1.00 0.48 O
|
| 223 |
+
ATOM 201 CG PRO A 25 -4.170 -6.915 -14.383 1.00 0.44 C
|
| 224 |
+
ATOM 202 CD PRO A 25 -3.916 -5.715 -15.250 1.00 0.45 C
|
| 225 |
+
ATOM 203 N GLU A 26 -2.595 -8.749 -18.537 1.00 0.52 N
|
| 226 |
+
ATOM 204 CA GLU A 26 -3.262 -9.728 -19.391 1.00 0.52 C
|
| 227 |
+
ATOM 205 C GLU A 26 -4.559 -10.221 -18.756 1.00 0.52 C
|
| 228 |
+
ATOM 206 CB GLU A 26 -2.334 -10.911 -19.678 1.00 0.49 C
|
| 229 |
+
ATOM 207 O GLU A 26 -4.681 -10.258 -17.530 1.00 0.50 O
|
| 230 |
+
ATOM 208 CG GLU A 26 -1.229 -10.598 -20.678 1.00 0.48 C
|
| 231 |
+
ATOM 209 CD GLU A 26 -0.782 -11.811 -21.477 1.00 0.50 C
|
| 232 |
+
ATOM 210 OE1 GLU A 26 -1.195 -12.945 -21.142 1.00 0.51 O
|
| 233 |
+
ATOM 211 OE2 GLU A 26 -0.012 -11.627 -22.446 1.00 0.47 O
|
| 234 |
+
ATOM 212 N PRO A 27 -5.776 -9.853 -19.347 1.00 0.50 N
|
| 235 |
+
ATOM 213 CA PRO A 27 -6.879 -10.502 -18.634 1.00 0.49 C
|
| 236 |
+
ATOM 214 C PRO A 27 -6.418 -11.681 -17.781 1.00 0.50 C
|
| 237 |
+
ATOM 215 CB PRO A 27 -7.800 -10.971 -19.763 1.00 0.47 C
|
| 238 |
+
ATOM 216 O PRO A 27 -5.769 -12.599 -18.290 1.00 0.49 O
|
| 239 |
+
ATOM 217 CG PRO A 27 -7.065 -10.637 -21.020 1.00 0.45 C
|
| 240 |
+
ATOM 218 CD PRO A 27 -5.782 -9.947 -20.654 1.00 0.45 C
|
| 241 |
+
ATOM 219 N VAL A 28 -5.550 -11.387 -16.664 1.00 0.49 N
|
| 242 |
+
ATOM 220 CA VAL A 28 -5.461 -12.521 -15.750 1.00 0.48 C
|
| 243 |
+
ATOM 221 C VAL A 28 -6.814 -13.223 -15.665 1.00 0.49 C
|
| 244 |
+
ATOM 222 CB VAL A 28 -4.999 -12.081 -14.343 1.00 0.47 C
|
| 245 |
+
ATOM 223 O VAL A 28 -7.852 -12.571 -15.529 1.00 0.48 O
|
| 246 |
+
ATOM 224 CG1 VAL A 28 -4.676 -13.297 -13.476 1.00 0.45 C
|
| 247 |
+
ATOM 225 CG2 VAL A 28 -3.787 -11.157 -14.445 1.00 0.46 C
|
| 248 |
+
ATOM 226 N GLN A 29 -7.108 -14.082 -16.749 1.00 0.53 N
|
| 249 |
+
ATOM 227 CA GLN A 29 -8.211 -15.035 -16.676 1.00 0.53 C
|
| 250 |
+
ATOM 228 C GLN A 29 -8.987 -14.878 -15.372 1.00 0.53 C
|
| 251 |
+
ATOM 229 CB GLN A 29 -7.692 -16.467 -16.810 1.00 0.48 C
|
| 252 |
+
ATOM 230 O GLN A 29 -8.402 -14.913 -14.287 1.00 0.51 O
|
| 253 |
+
ATOM 231 CG GLN A 29 -7.428 -16.894 -18.247 1.00 0.46 C
|
| 254 |
+
ATOM 232 CD GLN A 29 -7.062 -18.362 -18.366 1.00 0.48 C
|
| 255 |
+
ATOM 233 NE2 GLN A 29 -6.750 -18.799 -19.581 1.00 0.41 N
|
| 256 |
+
ATOM 234 OE1 GLN A 29 -7.058 -19.097 -17.373 1.00 0.50 O
|
| 257 |
+
ATOM 235 N GLY A 30 -9.845 -13.707 -15.208 1.00 0.58 N
|
| 258 |
+
ATOM 236 CA GLY A 30 -11.064 -13.567 -14.428 1.00 0.57 C
|
| 259 |
+
ATOM 237 C GLY A 30 -10.991 -12.454 -13.400 1.00 0.58 C
|
| 260 |
+
ATOM 238 O GLY A 30 -9.936 -12.218 -12.807 1.00 0.55 O
|
| 261 |
+
ATOM 239 N ALA A 31 -11.469 -11.174 -13.727 1.00 0.65 N
|
| 262 |
+
ATOM 240 CA ALA A 31 -11.865 -10.061 -12.868 1.00 0.64 C
|
| 263 |
+
ATOM 241 C ALA A 31 -11.715 -10.426 -11.394 1.00 0.65 C
|
| 264 |
+
ATOM 242 CB ALA A 31 -13.303 -9.645 -13.166 1.00 0.61 C
|
| 265 |
+
ATOM 243 O ALA A 31 -11.286 -9.600 -10.584 1.00 0.65 O
|
| 266 |
+
ATOM 244 N ALA A 32 -12.034 -11.668 -11.139 1.00 0.68 N
|
| 267 |
+
ATOM 245 CA ALA A 32 -11.937 -12.088 -9.744 1.00 0.67 C
|
| 268 |
+
ATOM 246 C ALA A 32 -10.490 -12.054 -9.260 1.00 0.68 C
|
| 269 |
+
ATOM 247 CB ALA A 32 -12.521 -13.487 -9.567 1.00 0.64 C
|
| 270 |
+
ATOM 248 O ALA A 32 -10.211 -11.605 -8.146 1.00 0.67 O
|
| 271 |
+
ATOM 249 N LEU A 33 -9.591 -12.548 -10.047 1.00 0.67 N
|
| 272 |
+
ATOM 250 CA LEU A 33 -8.192 -12.557 -9.631 1.00 0.66 C
|
| 273 |
+
ATOM 251 C LEU A 33 -7.660 -11.135 -9.484 1.00 0.67 C
|
| 274 |
+
ATOM 252 CB LEU A 33 -7.339 -13.332 -10.638 1.00 0.63 C
|
| 275 |
+
ATOM 253 O LEU A 33 -6.890 -10.849 -8.564 1.00 0.65 O
|
| 276 |
+
ATOM 254 CG LEU A 33 -5.950 -13.761 -10.161 1.00 0.59 C
|
| 277 |
+
ATOM 255 CD1 LEU A 33 -6.019 -15.128 -9.488 1.00 0.54 C
|
| 278 |
+
ATOM 256 CD2 LEU A 33 -4.967 -13.783 -11.326 1.00 0.55 C
|
| 279 |
+
ATOM 257 N GLN A 34 -8.043 -10.287 -10.357 1.00 0.68 N
|
| 280 |
+
ATOM 258 CA GLN A 34 -7.625 -8.892 -10.272 1.00 0.68 C
|
| 281 |
+
ATOM 259 C GLN A 34 -8.075 -8.262 -8.957 1.00 0.69 C
|
| 282 |
+
ATOM 260 CB GLN A 34 -8.176 -8.093 -11.454 1.00 0.65 C
|
| 283 |
+
ATOM 261 O GLN A 34 -7.304 -7.552 -8.307 1.00 0.68 O
|
| 284 |
+
ATOM 262 CG GLN A 34 -7.476 -6.759 -11.674 1.00 0.61 C
|
| 285 |
+
ATOM 263 CD GLN A 34 -7.961 -6.040 -12.920 1.00 0.59 C
|
| 286 |
+
ATOM 264 NE2 GLN A 34 -7.359 -4.893 -13.214 1.00 0.48 N
|
| 287 |
+
ATOM 265 OE1 GLN A 34 -8.868 -6.513 -13.612 1.00 0.55 O
|
| 288 |
+
ATOM 266 N ALA A 35 -9.311 -8.425 -8.724 1.00 0.72 N
|
| 289 |
+
ATOM 267 CA ALA A 35 -9.858 -7.851 -7.498 1.00 0.71 C
|
| 290 |
+
ATOM 268 C ALA A 35 -9.115 -8.371 -6.270 1.00 0.73 C
|
| 291 |
+
ATOM 269 CB ALA A 35 -11.348 -8.161 -7.384 1.00 0.70 C
|
| 292 |
+
ATOM 270 O ALA A 35 -8.849 -7.616 -5.332 1.00 0.73 O
|
| 293 |
+
ATOM 271 N GLU A 36 -8.800 -9.602 -6.322 1.00 0.75 N
|
| 294 |
+
ATOM 272 CA GLU A 36 -8.080 -10.212 -5.208 1.00 0.75 C
|
| 295 |
+
ATOM 273 C GLU A 36 -6.689 -9.603 -5.050 1.00 0.76 C
|
| 296 |
+
ATOM 274 CB GLU A 36 -7.971 -11.726 -5.403 1.00 0.73 C
|
| 297 |
+
ATOM 275 O GLU A 36 -6.256 -9.313 -3.933 1.00 0.75 O
|
| 298 |
+
ATOM 276 CG GLU A 36 -7.373 -12.458 -4.210 1.00 0.68 C
|
| 299 |
+
ATOM 277 CD GLU A 36 -7.447 -13.972 -4.336 1.00 0.65 C
|
| 300 |
+
ATOM 278 OE1 GLU A 36 -8.001 -14.471 -5.342 1.00 0.64 O
|
| 301 |
+
ATOM 279 OE2 GLU A 36 -6.948 -14.664 -3.421 1.00 0.59 O
|
| 302 |
+
ATOM 280 N LEU A 37 -5.986 -9.435 -6.129 1.00 0.74 N
|
| 303 |
+
ATOM 281 CA LEU A 37 -4.628 -8.906 -6.060 1.00 0.73 C
|
| 304 |
+
ATOM 282 C LEU A 37 -4.630 -7.469 -5.549 1.00 0.74 C
|
| 305 |
+
ATOM 283 CB LEU A 37 -3.957 -8.970 -7.435 1.00 0.71 C
|
| 306 |
+
ATOM 284 O LEU A 37 -3.797 -7.100 -4.718 1.00 0.73 O
|
| 307 |
+
ATOM 285 CG LEU A 37 -3.472 -10.348 -7.888 1.00 0.66 C
|
| 308 |
+
ATOM 286 CD1 LEU A 37 -3.167 -10.337 -9.382 1.00 0.61 C
|
| 309 |
+
ATOM 287 CD2 LEU A 37 -2.244 -10.772 -7.090 1.00 0.61 C
|
| 310 |
+
ATOM 288 N LEU A 38 -5.626 -6.678 -6.066 1.00 0.73 N
|
| 311 |
+
ATOM 289 CA LEU A 38 -5.729 -5.296 -5.610 1.00 0.73 C
|
| 312 |
+
ATOM 290 C LEU A 38 -6.038 -5.238 -4.118 1.00 0.75 C
|
| 313 |
+
ATOM 291 CB LEU A 38 -6.811 -4.551 -6.397 1.00 0.71 C
|
| 314 |
+
ATOM 292 O LEU A 38 -5.501 -4.391 -3.401 1.00 0.75 O
|
| 315 |
+
ATOM 293 CG LEU A 38 -6.433 -4.102 -7.809 1.00 0.66 C
|
| 316 |
+
ATOM 294 CD1 LEU A 38 -7.679 -3.690 -8.586 1.00 0.60 C
|
| 317 |
+
ATOM 295 CD2 LEU A 38 -5.428 -2.956 -7.755 1.00 0.60 C
|
| 318 |
+
ATOM 296 N SER A 39 -6.898 -6.106 -3.777 1.00 0.77 N
|
| 319 |
+
ATOM 297 CA SER A 39 -7.237 -6.146 -2.358 1.00 0.77 C
|
| 320 |
+
ATOM 298 C SER A 39 -6.013 -6.466 -1.507 1.00 0.78 C
|
| 321 |
+
ATOM 299 CB SER A 39 -8.333 -7.179 -2.098 1.00 0.75 C
|
| 322 |
+
ATOM 300 O SER A 39 -5.816 -5.871 -0.445 1.00 0.78 O
|
| 323 |
+
ATOM 301 OG SER A 39 -8.631 -7.259 -0.715 1.00 0.68 O
|
| 324 |
+
ATOM 302 N GLN A 40 -5.194 -7.402 -1.959 1.00 0.76 N
|
| 325 |
+
ATOM 303 CA GLN A 40 -4.005 -7.786 -1.205 1.00 0.76 C
|
| 326 |
+
ATOM 304 C GLN A 40 -3.014 -6.629 -1.113 1.00 0.76 C
|
| 327 |
+
ATOM 305 CB GLN A 40 -3.334 -9.003 -1.843 1.00 0.73 C
|
| 328 |
+
ATOM 306 O GLN A 40 -2.433 -6.384 -0.054 1.00 0.76 O
|
| 329 |
+
ATOM 307 CG GLN A 40 -4.050 -10.317 -1.563 1.00 0.66 C
|
| 330 |
+
ATOM 308 CD GLN A 40 -3.515 -11.466 -2.398 1.00 0.63 C
|
| 331 |
+
ATOM 309 NE2 GLN A 40 -4.359 -12.462 -2.645 1.00 0.54 N
|
| 332 |
+
ATOM 310 OE1 GLN A 40 -2.354 -11.457 -2.820 1.00 0.60 O
|
| 333 |
+
ATOM 311 N VAL A 41 -2.792 -5.944 -2.237 1.00 0.75 N
|
| 334 |
+
ATOM 312 CA VAL A 41 -1.862 -4.820 -2.237 1.00 0.74 C
|
| 335 |
+
ATOM 313 C VAL A 41 -2.341 -3.752 -1.256 1.00 0.75 C
|
| 336 |
+
ATOM 314 CB VAL A 41 -1.704 -4.213 -3.649 1.00 0.73 C
|
| 337 |
+
ATOM 315 O VAL A 41 -1.550 -3.215 -0.478 1.00 0.74 O
|
| 338 |
+
ATOM 316 CG1 VAL A 41 -0.947 -2.888 -3.586 1.00 0.65 C
|
| 339 |
+
ATOM 317 CG2 VAL A 41 -0.989 -5.196 -4.575 1.00 0.66 C
|
| 340 |
+
ATOM 318 N ARG A 42 -3.621 -3.448 -1.361 1.00 0.76 N
|
| 341 |
+
ATOM 319 CA ARG A 42 -4.187 -2.474 -0.434 1.00 0.76 C
|
| 342 |
+
ATOM 320 C ARG A 42 -3.930 -2.881 1.013 1.00 0.77 C
|
| 343 |
+
ATOM 321 CB ARG A 42 -5.690 -2.314 -0.673 1.00 0.74 C
|
| 344 |
+
ATOM 322 O ARG A 42 -3.588 -2.042 1.849 1.00 0.77 O
|
| 345 |
+
ATOM 323 CG ARG A 42 -6.034 -1.420 -1.855 1.00 0.69 C
|
| 346 |
+
ATOM 324 CD ARG A 42 -7.539 -1.280 -2.036 1.00 0.65 C
|
| 347 |
+
ATOM 325 NE ARG A 42 -7.867 -0.358 -3.120 1.00 0.59 N
|
| 348 |
+
ATOM 326 NH1 ARG A 42 -10.131 -0.814 -3.100 1.00 0.49 N
|
| 349 |
+
ATOM 327 NH2 ARG A 42 -9.276 0.705 -4.589 1.00 0.45 N
|
| 350 |
+
ATOM 328 CZ ARG A 42 -9.091 -0.158 -3.601 1.00 0.59 C
|
| 351 |
+
ATOM 329 N GLN A 43 -4.215 -4.159 1.316 1.00 0.78 N
|
| 352 |
+
ATOM 330 CA GLN A 43 -4.020 -4.652 2.675 1.00 0.78 C
|
| 353 |
+
ATOM 331 C GLN A 43 -2.557 -4.540 3.096 1.00 0.78 C
|
| 354 |
+
ATOM 332 CB GLN A 43 -4.490 -6.103 2.793 1.00 0.76 C
|
| 355 |
+
ATOM 333 O GLN A 43 -2.260 -4.175 4.235 1.00 0.77 O
|
| 356 |
+
ATOM 334 CG GLN A 43 -4.504 -6.630 4.222 1.00 0.70 C
|
| 357 |
+
ATOM 335 CD GLN A 43 -5.452 -5.861 5.123 1.00 0.66 C
|
| 358 |
+
ATOM 336 NE2 GLN A 43 -4.992 -5.534 6.326 1.00 0.60 N
|
| 359 |
+
ATOM 337 OE1 GLN A 43 -6.588 -5.562 4.740 1.00 0.66 O
|
| 360 |
+
ATOM 338 N ASP A 44 -1.615 -4.885 2.249 1.00 0.76 N
|
| 361 |
+
ATOM 339 CA ASP A 44 -0.192 -4.815 2.566 1.00 0.75 C
|
| 362 |
+
ATOM 340 C ASP A 44 0.231 -3.383 2.883 1.00 0.75 C
|
| 363 |
+
ATOM 341 CB ASP A 44 0.643 -5.366 1.409 1.00 0.73 C
|
| 364 |
+
ATOM 342 O ASP A 44 1.022 -3.152 3.800 1.00 0.74 O
|
| 365 |
+
ATOM 343 CG ASP A 44 0.532 -6.874 1.262 1.00 0.68 C
|
| 366 |
+
ATOM 344 OD1 ASP A 44 0.048 -7.544 2.199 1.00 0.65 O
|
| 367 |
+
ATOM 345 OD2 ASP A 44 0.935 -7.396 0.200 1.00 0.66 O
|
| 368 |
+
ATOM 346 N ILE A 45 -0.307 -2.445 2.112 1.00 0.76 N
|
| 369 |
+
ATOM 347 CA ILE A 45 -0.006 -1.042 2.371 1.00 0.75 C
|
| 370 |
+
ATOM 348 C ILE A 45 -0.514 -0.652 3.758 1.00 0.76 C
|
| 371 |
+
ATOM 349 CB ILE A 45 -0.625 -0.122 1.295 1.00 0.73 C
|
| 372 |
+
ATOM 350 O ILE A 45 0.201 -0.011 4.531 1.00 0.75 O
|
| 373 |
+
ATOM 351 CG1 ILE A 45 0.082 -0.322 -0.050 1.00 0.67 C
|
| 374 |
+
ATOM 352 CG2 ILE A 45 -0.561 1.343 1.735 1.00 0.67 C
|
| 375 |
+
ATOM 353 CD1 ILE A 45 -0.609 0.363 -1.221 1.00 0.64 C
|
| 376 |
+
ATOM 354 N ALA A 46 -1.744 -0.973 3.967 1.00 0.76 N
|
| 377 |
+
ATOM 355 CA ALA A 46 -2.318 -0.646 5.269 1.00 0.75 C
|
| 378 |
+
ATOM 356 C ALA A 46 -1.481 -1.235 6.401 1.00 0.76 C
|
| 379 |
+
ATOM 357 CB ALA A 46 -3.757 -1.149 5.358 1.00 0.74 C
|
| 380 |
+
ATOM 358 O ALA A 46 -1.239 -0.572 7.413 1.00 0.75 O
|
| 381 |
+
ATOM 359 N ASN A 47 -1.100 -2.513 6.275 1.00 0.77 N
|
| 382 |
+
ATOM 360 CA ASN A 47 -0.287 -3.170 7.293 1.00 0.76 C
|
| 383 |
+
ATOM 361 C ASN A 47 1.046 -2.456 7.493 1.00 0.75 C
|
| 384 |
+
ATOM 362 CB ASN A 47 -0.053 -4.638 6.927 1.00 0.74 C
|
| 385 |
+
ATOM 363 O ASN A 47 1.503 -2.291 8.626 1.00 0.74 O
|
| 386 |
+
ATOM 364 CG ASN A 47 -1.313 -5.474 7.032 1.00 0.70 C
|
| 387 |
+
ATOM 365 ND2 ASN A 47 -1.301 -6.646 6.410 1.00 0.68 N
|
| 388 |
+
ATOM 366 OD1 ASN A 47 -2.291 -5.068 7.666 1.00 0.70 O
|
| 389 |
+
ATOM 367 N SER A 48 1.642 -2.096 6.364 1.00 0.75 N
|
| 390 |
+
ATOM 368 CA SER A 48 2.925 -1.406 6.457 1.00 0.73 C
|
| 391 |
+
ATOM 369 C SER A 48 2.787 -0.078 7.194 1.00 0.73 C
|
| 392 |
+
ATOM 370 CB SER A 48 3.508 -1.167 5.064 1.00 0.71 C
|
| 393 |
+
ATOM 371 O SER A 48 3.641 0.278 8.009 1.00 0.70 O
|
| 394 |
+
ATOM 372 OG SER A 48 3.811 -2.397 4.427 1.00 0.65 O
|
| 395 |
+
ATOM 373 N LEU A 49 1.734 0.640 6.845 1.00 0.72 N
|
| 396 |
+
ATOM 374 CA LEU A 49 1.508 1.919 7.510 1.00 0.72 C
|
| 397 |
+
ATOM 375 C LEU A 49 1.271 1.721 9.003 1.00 0.72 C
|
| 398 |
+
ATOM 376 CB LEU A 49 0.315 2.645 6.884 1.00 0.70 C
|
| 399 |
+
ATOM 377 O LEU A 49 1.763 2.499 9.823 1.00 0.71 O
|
| 400 |
+
ATOM 378 CG LEU A 49 0.534 3.228 5.487 1.00 0.66 C
|
| 401 |
+
ATOM 379 CD1 LEU A 49 -0.788 3.717 4.904 1.00 0.61 C
|
| 402 |
+
ATOM 380 CD2 LEU A 49 1.554 4.360 5.533 1.00 0.62 C
|
| 403 |
+
ATOM 381 N ASN A 50 0.475 0.748 9.327 1.00 0.74 N
|
| 404 |
+
ATOM 382 CA ASN A 50 0.188 0.475 10.732 1.00 0.73 C
|
| 405 |
+
ATOM 383 C ASN A 50 1.452 0.099 11.499 1.00 0.73 C
|
| 406 |
+
ATOM 384 CB ASN A 50 -0.860 -0.633 10.860 1.00 0.70 C
|
| 407 |
+
ATOM 385 O ASN A 50 1.622 0.493 12.654 1.00 0.71 O
|
| 408 |
+
ATOM 386 CG ASN A 50 -2.278 -0.113 10.732 1.00 0.65 C
|
| 409 |
+
ATOM 387 ND2 ASN A 50 -3.214 -1.008 10.440 1.00 0.63 N
|
| 410 |
+
ATOM 388 OD1 ASN A 50 -2.530 1.084 10.892 1.00 0.63 O
|
| 411 |
+
ATOM 389 N ALA A 51 2.296 -0.751 10.857 1.00 0.70 N
|
| 412 |
+
ATOM 390 CA ALA A 51 3.518 -1.202 11.518 1.00 0.69 C
|
| 413 |
+
ATOM 391 C ALA A 51 4.408 -0.020 11.892 1.00 0.68 C
|
| 414 |
+
ATOM 392 CB ALA A 51 4.279 -2.175 10.622 1.00 0.67 C
|
| 415 |
+
ATOM 393 O ALA A 51 5.038 -0.020 12.952 1.00 0.67 O
|
| 416 |
+
ATOM 394 N VAL A 52 4.525 0.951 11.010 1.00 0.67 N
|
| 417 |
+
ATOM 395 CA VAL A 52 5.352 2.120 11.288 1.00 0.66 C
|
| 418 |
+
ATOM 396 C VAL A 52 4.723 2.942 12.411 1.00 0.65 C
|
| 419 |
+
ATOM 397 CB VAL A 52 5.540 2.995 10.029 1.00 0.63 C
|
| 420 |
+
ATOM 398 O VAL A 52 5.431 3.495 13.256 1.00 0.64 O
|
| 421 |
+
ATOM 399 CG1 VAL A 52 6.261 4.296 10.378 1.00 0.57 C
|
| 422 |
+
ATOM 400 CG2 VAL A 52 6.309 2.226 8.956 1.00 0.58 C
|
| 423 |
+
ATOM 401 N ALA A 53 3.415 3.071 12.338 1.00 0.62 N
|
| 424 |
+
ATOM 402 CA ALA A 53 2.746 3.869 13.362 1.00 0.60 C
|
| 425 |
+
ATOM 403 C ALA A 53 2.961 3.272 14.750 1.00 0.61 C
|
| 426 |
+
ATOM 404 CB ALA A 53 1.253 3.980 13.059 1.00 0.58 C
|
| 427 |
+
ATOM 405 O ALA A 53 2.919 3.988 15.753 1.00 0.61 O
|
| 428 |
+
ATOM 406 N THR A 54 3.105 1.936 14.815 1.00 0.61 N
|
| 429 |
+
ATOM 407 CA THR A 54 3.158 1.297 16.126 1.00 0.61 C
|
| 430 |
+
ATOM 408 C THR A 54 4.591 1.253 16.648 1.00 0.61 C
|
| 431 |
+
ATOM 409 CB THR A 54 2.583 -0.131 16.075 1.00 0.57 C
|
| 432 |
+
ATOM 410 O THR A 54 4.835 0.805 17.770 1.00 0.59 O
|
| 433 |
+
ATOM 411 CG2 THR A 54 1.101 -0.114 15.712 1.00 0.51 C
|
| 434 |
+
ATOM 412 OG1 THR A 54 3.295 -0.892 15.092 1.00 0.54 O
|
| 435 |
+
ATOM 413 N ARG A 55 5.532 1.631 15.809 1.00 0.65 N
|
| 436 |
+
ATOM 414 CA ARG A 55 6.903 1.567 16.303 1.00 0.65 C
|
| 437 |
+
ATOM 415 C ARG A 55 7.118 2.544 17.453 1.00 0.65 C
|
| 438 |
+
ATOM 416 CB ARG A 55 7.895 1.859 15.175 1.00 0.61 C
|
| 439 |
+
ATOM 417 O ARG A 55 6.621 3.672 17.418 1.00 0.63 O
|
| 440 |
+
ATOM 418 CG ARG A 55 8.075 0.707 14.199 1.00 0.59 C
|
| 441 |
+
ATOM 419 CD ARG A 55 9.132 1.017 13.148 1.00 0.59 C
|
| 442 |
+
ATOM 420 NE ARG A 55 9.264 -0.067 12.180 1.00 0.52 N
|
| 443 |
+
ATOM 421 NH1 ARG A 55 10.992 0.916 11.004 1.00 0.41 N
|
| 444 |
+
ATOM 422 NH2 ARG A 55 10.179 -1.128 10.359 1.00 0.36 N
|
| 445 |
+
ATOM 423 CZ ARG A 55 10.145 -0.090 11.183 1.00 0.55 C
|
| 446 |
+
ATOM 424 N PRO A 56 7.605 2.129 18.545 1.00 0.62 N
|
| 447 |
+
ATOM 425 CA PRO A 56 7.968 3.039 19.634 1.00 0.61 C
|
| 448 |
+
ATOM 426 C PRO A 56 8.765 4.249 19.151 1.00 0.62 C
|
| 449 |
+
ATOM 427 CB PRO A 56 8.815 2.161 20.559 1.00 0.56 C
|
| 450 |
+
ATOM 428 O PRO A 56 9.656 4.109 18.310 1.00 0.59 O
|
| 451 |
+
ATOM 429 CG PRO A 56 8.612 0.769 20.056 1.00 0.53 C
|
| 452 |
+
ATOM 430 CD PRO A 56 8.067 0.841 18.659 1.00 0.55 C
|
| 453 |
+
ATOM 431 N GLY A 57 8.356 5.535 19.374 1.00 0.58 N
|
| 454 |
+
ATOM 432 CA GLY A 57 8.945 6.814 19.010 1.00 0.58 C
|
| 455 |
+
ATOM 433 C GLY A 57 8.368 7.395 17.733 1.00 0.58 C
|
| 456 |
+
ATOM 434 O GLY A 57 8.738 8.497 17.323 1.00 0.57 O
|
| 457 |
+
ATOM 435 N TYR A 58 7.679 6.470 16.960 1.00 0.54 N
|
| 458 |
+
ATOM 436 CA TYR A 58 7.100 7.135 15.799 1.00 0.54 C
|
| 459 |
+
ATOM 437 C TYR A 58 6.095 8.199 16.224 1.00 0.53 C
|
| 460 |
+
ATOM 438 CB TYR A 58 6.422 6.116 14.878 1.00 0.50 C
|
| 461 |
+
ATOM 439 O TYR A 58 6.065 9.295 15.659 1.00 0.52 O
|
| 462 |
+
ATOM 440 CG TYR A 58 6.176 6.630 13.480 1.00 0.49 C
|
| 463 |
+
ATOM 441 CD1 TYR A 58 4.946 7.177 13.124 1.00 0.47 C
|
| 464 |
+
ATOM 442 CD2 TYR A 58 7.173 6.568 12.512 1.00 0.48 C
|
| 465 |
+
ATOM 443 CE1 TYR A 58 4.714 7.650 11.837 1.00 0.47 C
|
| 466 |
+
ATOM 444 CE2 TYR A 58 6.953 7.038 11.222 1.00 0.48 C
|
| 467 |
+
ATOM 445 OH TYR A 58 5.498 8.043 9.618 1.00 0.44 O
|
| 468 |
+
ATOM 446 CZ TYR A 58 5.722 7.577 10.894 1.00 0.46 C
|
| 469 |
+
ATOM 447 N LEU A 59 5.154 7.928 17.166 1.00 0.50 N
|
| 470 |
+
ATOM 448 CA LEU A 59 4.166 8.868 17.683 1.00 0.50 C
|
| 471 |
+
ATOM 449 C LEU A 59 4.547 9.344 19.081 1.00 0.50 C
|
| 472 |
+
ATOM 450 CB LEU A 59 2.777 8.224 17.711 1.00 0.47 C
|
| 473 |
+
ATOM 451 O LEU A 59 3.832 10.147 19.684 1.00 0.49 O
|
| 474 |
+
ATOM 452 CG LEU A 59 2.130 7.949 16.353 1.00 0.45 C
|
| 475 |
+
ATOM 453 CD1 LEU A 59 0.903 7.059 16.521 1.00 0.42 C
|
| 476 |
+
ATOM 454 CD2 LEU A 59 1.756 9.257 15.663 1.00 0.44 C
|
| 477 |
+
ATOM 455 N ALA A 60 5.718 9.319 19.497 1.00 0.46 N
|
| 478 |
+
ATOM 456 CA ALA A 60 6.022 9.769 20.853 1.00 0.46 C
|
| 479 |
+
ATOM 457 C ALA A 60 7.279 10.633 20.876 1.00 0.46 C
|
| 480 |
+
ATOM 458 CB ALA A 60 6.186 8.572 21.787 1.00 0.42 C
|
| 481 |
+
ATOM 459 O ALA A 60 8.383 10.140 20.637 1.00 0.44 O
|
| 482 |
+
ATOM 460 N GLY A 61 7.357 11.746 20.071 1.00 0.45 N
|
| 483 |
+
ATOM 461 CA GLY A 61 8.144 12.780 20.724 1.00 0.45 C
|
| 484 |
+
ATOM 462 C GLY A 61 7.607 14.179 20.487 1.00 0.46 C
|
| 485 |
+
ATOM 463 O GLY A 61 7.660 14.688 19.366 1.00 0.44 O
|
| 486 |
+
ATOM 464 N GLY A 62 6.464 14.557 21.219 1.00 0.33 N
|
| 487 |
+
ATOM 465 CA GLY A 62 6.288 15.850 21.860 1.00 0.36 C
|
| 488 |
+
ATOM 466 C GLY A 62 7.363 16.852 21.487 1.00 0.33 C
|
| 489 |
+
ATOM 467 O GLY A 62 8.470 16.469 21.102 1.00 0.32 O
|
esm/mcp_output/requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastmcp>=0.1.0
|
| 2 |
+
pydantic>=2.0.0
|
| 3 |
+
requests
|
| 4 |
+
biopython
|
esm/mcp_output/start_mcp.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
MCP Service Startup Entry Point
|
| 4 |
+
"""
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
project_root = os.path.dirname(os.path.abspath(__file__))
|
| 9 |
+
mcp_plugin_dir = os.path.join(project_root, "mcp_plugin")
|
| 10 |
+
if mcp_plugin_dir not in sys.path:
|
| 11 |
+
sys.path.insert(0, mcp_plugin_dir)
|
| 12 |
+
|
| 13 |
+
# Set path to point to source directory
|
| 14 |
+
source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "source")
|
| 15 |
+
sys.path.insert(0, source_path)
|
| 16 |
+
|
| 17 |
+
from mcp_service import create_app
|
| 18 |
+
|
| 19 |
+
def main():
|
| 20 |
+
"""Start FastMCP Service"""
|
| 21 |
+
app = create_app()
|
| 22 |
+
# Use environment variable to configure port, default 8000
|
| 23 |
+
port = int(os.environ.get("MCP_PORT", "8000"))
|
| 24 |
+
|
| 25 |
+
# Select transport mode based on environment variable
|
| 26 |
+
transport = os.environ.get("MCP_TRANSPORT", "stdio")
|
| 27 |
+
if transport == "http":
|
| 28 |
+
app.run(transport="http", host="0.0.0.0", port=port)
|
| 29 |
+
else:
|
| 30 |
+
# Default to STDIO mode
|
| 31 |
+
app.run()
|
| 32 |
+
|
| 33 |
+
if __name__ == "__main__":
|
| 34 |
+
main()
|
esm/mcp_output/tests_mcp/test_mcp_basic.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MCP Service Basic Tests
|
| 3 |
+
"""
|
| 4 |
+
import sys
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 8 |
+
mcp_plugin_dir = os.path.join(project_root, "mcp_plugin")
|
| 9 |
+
if mcp_plugin_dir not in sys.path:
|
| 10 |
+
sys.path.insert(0, mcp_plugin_dir)
|
| 11 |
+
|
| 12 |
+
source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "source")
|
| 13 |
+
sys.path.insert(0, source_path)
|
| 14 |
+
|
| 15 |
+
def test_import_mcp_service():
    """Test that the MCP service can be imported correctly"""
    try:
        from mcp_service import create_app
        application = create_app()
        assert application is not None
    except Exception as exc:
        # Any failure (missing module, factory error) is reported, not raised.
        print(f"Failed to import MCP service: {exc}")
        return False
    print("MCP service imported successfully")
    return True
|
| 26 |
+
|
| 27 |
+
def test_adapter_init():
    """Test that the adapter can be initialized correctly"""
    try:
        from adapter import Adapter
        instance = Adapter()
        assert instance is not None
    except Exception as exc:
        # Any failure (missing module, constructor error) is reported, not raised.
        print(f"Failed to initialize adapter: {exc}")
        return False
    print("Adapter initialized successfully")
    return True
|
| 38 |
+
|
| 39 |
+
if __name__ == "__main__":
|
| 40 |
+
print("Running MCP service basic tests...")
|
| 41 |
+
test1 = test_import_mcp_service()
|
| 42 |
+
test2 = test_adapter_init()
|
| 43 |
+
|
| 44 |
+
if test1 and test2:
|
| 45 |
+
print("All basic tests passed")
|
| 46 |
+
sys.exit(0)
|
| 47 |
+
else:
|
| 48 |
+
print("Some tests failed")
|
| 49 |
+
sys.exit(1)
|
esm/mcp_output/tests_smoke/test_smoke.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib, sys
import os

# Add current directory to Python path
sys.path.insert(0, os.getcwd())

# Also expose the "source" subtree, if present, so the package resolves.
source_dir = os.path.join(os.getcwd(), "source")
if os.path.exists(source_dir):
    sys.path.insert(0, source_dir)


try:
    importlib.import_module("esm")
    print("OK - Successfully imported esm")
except ImportError as e:
    print(f"Failed to import esm: {e}")
    # Fallback retry list. (The original assigned an empty list first and
    # immediately overwrote it — dead assignment removed.)
    # NOTE(review): retrying 'esm' repeats the import that just failed;
    # presumably a template placeholder for alternate package names.
    fallback_packages = ['esm']

    for pkg in fallback_packages:
        try:
            importlib.import_module(pkg)
            print(f"OK - Successfully imported {pkg}")
            break
        except ImportError:
            continue
    else:
        # for/else: runs only when no fallback import succeeded.
        print("All import attempts failed")
|
esm/source/.flake8
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[flake8]
|
| 2 |
+
max-line-length = 99
|
| 3 |
+
ignore = E203,W503
|
| 4 |
+
exclude =
|
| 5 |
+
.git,
|
| 6 |
+
__pycache__,
|
| 7 |
+
build,
|
| 8 |
+
dist,
|
| 9 |
+
experimental,
|
| 10 |
+
third_party
|
esm/source/.git-blame-ignore-revs
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Migrate code style to Black
|
| 2 |
+
8bc7e948cd9bf0b6d1f2113e221ef548ef663377
|
esm/source/.github/ISSUE_TEMPLATE/bug.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: "[Bug Report]"
|
| 3 |
+
about: "Create a bug report. For other questions: see Discussions tab."
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
NOTE: if this is not a bug report, please use the [GitHub Discussions](https://github.com/facebookresearch/esm/discussions) for support questions (How do I do X?), feature requests, ideas, showcasing new applications, etc.
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
**Bug description**
|
| 11 |
+
Please enter a clear and concise description of what the bug is.
|
| 12 |
+
|
| 13 |
+
**Reproduction steps**
|
| 14 |
+
Enter steps to reproduce the behavior.
|
| 15 |
+
|
| 16 |
+
**Expected behavior**
|
| 17 |
+
Give a clear and concise description of what you expected to happen.
|
| 18 |
+
|
| 19 |
+
**Logs**
|
| 20 |
+
Please paste the command line output:
|
| 21 |
+
|
| 22 |
+
```
|
| 23 |
+
Output goes here
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
**Additional context**
|
| 27 |
+
Add any other context about the problem here. (like proxy settings, network setup, overall goals, etc.)
|
esm/source/.gitignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# tensor dumps
|
| 2 |
+
*.pt
|
| 3 |
+
# Compiler Output #
|
| 4 |
+
###################
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*.so
|
| 7 |
+
*.o
|
| 8 |
+
*.exe
|
| 9 |
+
*.class
|
| 10 |
+
|
| 11 |
+
# Folders #
|
| 12 |
+
###########
|
| 13 |
+
bin/
|
| 14 |
+
build/
|
| 15 |
+
dist/
|
| 16 |
+
local/
|
| 17 |
+
tmp/
|
| 18 |
+
__pycache__/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.idea/
|
| 21 |
+
.ipynb_checkpoints/
|
| 22 |
+
.vscode/
|
| 23 |
+
esm/dev
|
| 24 |
+
|
| 25 |
+
# Junk #
|
| 26 |
+
########
|
| 27 |
+
.DS_Store*
|
| 28 |
+
.*.swp
|
| 29 |
+
*.swp
|
| 30 |
+
*.log
|
| 31 |
+
*~
|
esm/source/CODE_OF_CONDUCT.rst
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Code of Conduct
|
| 2 |
+
===============
|
| 3 |
+
|
| 4 |
+
Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please `read the full text`__ so that you can understand what actions will and will not be tolerated.
|
| 5 |
+
|
| 6 |
+
__ https://code.facebook.com/codeofconduct
|
esm/source/CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing to esm
|
| 2 |
+
We want to make contributing to this project as easy and transparent as
|
| 3 |
+
possible.
|
| 4 |
+
|
| 5 |
+
## Pull Requests
|
| 6 |
+
We actively welcome your pull requests.
|
| 7 |
+
|
| 8 |
+
1. Fork the repo and create your branch from `master`.
|
| 9 |
+
2. If you've added code that should be tested, add tests.
|
| 10 |
+
3. If you've changed APIs, update the documentation.
|
| 11 |
+
4. Ensure the test suite passes.
|
| 12 |
+
5. Make sure your code lints.
|
| 13 |
+
6. If you haven't already, complete the Contributor License Agreement ("CLA").
|
| 14 |
+
|
| 15 |
+
## Contributor License Agreement ("CLA")
|
| 16 |
+
In order to accept your pull request, we need you to submit a CLA. You only need
|
| 17 |
+
to do this once to work on any of Facebook's open source projects.
|
| 18 |
+
|
| 19 |
+
Complete your CLA here: <https://code.facebook.com/cla>
|
| 20 |
+
|
| 21 |
+
## Issues
|
| 22 |
+
We use GitHub issues to track public bugs. Please ensure your description is
|
| 23 |
+
clear and has sufficient instructions to be able to reproduce the issue.
|
| 24 |
+
|
| 25 |
+
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
|
| 26 |
+
disclosure of security bugs. In those cases, please go through the process
|
| 27 |
+
outlined on that page and do not file a public issue.
|
| 28 |
+
|
| 29 |
+
## License
|
| 30 |
+
By contributing to esm, you agree that your contributions will be licensed
|
| 31 |
+
under the LICENSE file in the root directory of this source tree.
|
esm/source/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
esm/source/README.md
ADDED
|
@@ -0,0 +1,795 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evolutionary Scale Modeling
|
| 2 |
+
|
| 3 |
+
[](https://esmatlas.com)
|
| 4 |
+
|
| 5 |
+
***Update April 2023:*** Code for the two simultaneous preprints on protein design is now released! Code for "Language models generalize beyond natural proteins" is under [examples/lm-design/](examples/lm-design/). Code for "A high-level programming language for generative protein design" is under [examples/protein-programming-language/](examples/protein-programming-language/).
|
| 6 |
+
|
| 7 |
+
This repository contains code and pre-trained weights for **Transformer protein language models** from the Meta Fundamental AI Research Protein Team (FAIR), including our state-of-the-art [**ESM-2** and **ESMFold**](#esmfold), as well as [**MSA Transformer**](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1), [**ESM-1v**](#zs_variant) for predicting variant effects and [**ESM-IF1**](#invf) for inverse folding.
|
| 8 |
+
Transformer protein language models were introduced in the [2019 preprint](https://doi.org/10.1101/622803) of the paper ["Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences"](https://doi.org/10.1073/pnas.2016239118).
|
| 9 |
+
ESM-2 outperforms all tested single-sequence protein language models across a range of structure prediction tasks.
|
| 10 |
+
ESMFold harnesses the ESM-2 language model to generate accurate structure predictions end to end directly from the sequence of a protein.
|
| 11 |
+
|
| 12 |
+
In November 2022, we released `v0` of the [ESM Metagenomic Atlas](https://esmatlas.com), an open atlas of 617 million predicted metagenomic protein structures.
|
| 13 |
+
The Atlas was updated in March 2023 in collaboration with EBI. The new `v2023_02` adds another 150 million predicted structures to the Atlas, as well as pre-computed ESM2 embeddings.
|
| 14 |
+
Bulk download, blog post and the resources provided on the Atlas website are documented [on this README](#atlas).
|
| 15 |
+
|
| 16 |
+
In December 2022, we released two simultaneous preprints on protein design.
|
| 17 |
+
* "Language models generalize beyond natural proteins" ([PAPER](https://doi.org/10.1101/2022.12.21.521521), [CODE](examples/lm-design/)) uses ESM2 to design de novo proteins. The code and data associated with the preprint can be found [here](examples/lm-design/).
|
| 18 |
+
* "A high-level programming language for generative protein design" ([PAPER](https://doi.org/10.1101/2022.12.21.521526), [CODE](examples/protein-programming-language/)) uses ESMFold to design proteins according to a high-level programming language.
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
<details><summary><b>Citation</b></summary>
|
| 23 |
+
For ESM2, ESMFold and ESM Atlas:
|
| 24 |
+
```bibtex
|
| 25 |
+
@article{lin2023evolutionary,
|
| 26 |
+
title = {Evolutionary-scale prediction of atomic-level protein structure with a language model},
|
| 27 |
+
author = {Zeming Lin and Halil Akin and Roshan Rao and Brian Hie and Zhongkai Zhu and Wenting Lu and Nikita Smetanin and Robert Verkuil and Ori Kabeli and Yaniv Shmueli and Allan dos Santos Costa and Maryam Fazel-Zarandi and Tom Sercu and Salvatore Candido and Alexander Rives },
|
| 28 |
+
journal = {Science},
|
| 29 |
+
volume = {379},
|
| 30 |
+
number = {6637},
|
| 31 |
+
pages = {1123-1130},
|
| 32 |
+
year = {2023},
|
| 33 |
+
doi = {10.1126/science.ade2574},
|
| 34 |
+
URL = {https://www.science.org/doi/abs/10.1126/science.ade2574},
|
| 35 |
+
note={Earlier versions as preprint: bioRxiv 2022.07.20.500902},
|
| 36 |
+
}
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
For transformer protein language models:
|
| 40 |
+
```bibtex
|
| 41 |
+
@article{rives2021biological,
|
| 42 |
+
title={Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences},
|
| 43 |
+
author={Rives, Alexander and Meier, Joshua and Sercu, Tom and Goyal, Siddharth and Lin, Zeming and Liu, Jason and Guo, Demi and Ott, Myle and Zitnick, C Lawrence and Ma, Jerry and others},
|
| 44 |
+
journal={Proceedings of the National Academy of Sciences},
|
| 45 |
+
volume={118},
|
| 46 |
+
number={15},
|
| 47 |
+
pages={e2016239118},
|
| 48 |
+
year={2021},
|
| 49 |
+
publisher={National Acad Sciences},
|
| 50 |
+
note={bioRxiv 10.1101/622803},
|
| 51 |
+
doi={10.1073/pnas.2016239118},
|
| 52 |
+
url={https://www.pnas.org/doi/full/10.1073/pnas.2016239118},
|
| 53 |
+
}
|
| 54 |
+
```
|
| 55 |
+
</details>
|
| 56 |
+
|
| 57 |
+
<details open><summary><b>Table of contents</b></summary>
|
| 58 |
+
|
| 59 |
+
- [Main models you should use](#main-models)
|
| 60 |
+
- [Usage](#usage)
|
| 61 |
+
- [Quick Start](#quickstart)
|
| 62 |
+
- [Getting Started with this repository](#repostart)
|
| 63 |
+
- [ESMFold Structure Prediction](#esmfold)
|
| 64 |
+
- [Compute embeddings in bulk from FASTA](#bulk_fasta)
|
| 65 |
+
- [CPU offloading for inference with large models](#fsdp)
|
| 66 |
+
- [Zero-shot variant prediction](#zs_variant)
|
| 67 |
+
- [Inverse folding](#invf)
|
| 68 |
+
- [ESM Metagenomic Atlas](#atlas)
|
| 69 |
+
- [Notebooks](#notebooks)
|
| 70 |
+
- [Available Models and Datasets](#available)
|
| 71 |
+
- [Pre-trained Models](#available-models)
|
| 72 |
+
- [ESM Structural Split Dataset](#available-esmssd)
|
| 73 |
+
- [Pre-training Dataset Split](#available-pretraining-split)
|
| 74 |
+
- [Comparison to related works](#perf_related)
|
| 75 |
+
- [Citations](#citations)
|
| 76 |
+
- [License](#license)
|
| 77 |
+
</details>
|
| 78 |
+
|
| 79 |
+
<details><summary><b>What's New</b></summary>
|
| 80 |
+
|
| 81 |
+
- April 2023: Code for the protein design preprints released under [examples/lm-design/](examples/lm-design/).
|
| 82 |
+
- March 2023: We release an update to the ESM Metagenomic Atlas, `v2023_02`. See [website](https://esmatlas.com/) and [bulk download details](#atlas).
|
| 83 |
+
- December 2022: The Meta Fundamental AI Research Protein Team (FAIR) released two simultaneous preprints on protein design:
|
| 84 |
+
["Language models generalize beyond natural proteins" (Verkuil, Kabeli, et al., 2022)](https://doi.org/10.1101/2022.12.21.521521), and ["A high-level programming language for generative protein design" (Hie, Candido, et al., 2022)](https://doi.org/10.1101/2022.12.21.521521).
|
| 85 |
+
- November 2022: ESM Metagenomic Atlas, a repository of 600M+ metagenomics structures released, see [website](https://esmatlas.com/) and [bulk download details](#atlas)
|
| 86 |
+
- November 2022: ESMFold - new end-to-end structure prediction model released (see [Lin et al. 2022](https://www.science.org/doi/abs/10.1126/science.ade2574))
|
| 87 |
+
- August 2022: ESM-2 - new SOTA Language Models released (see [Lin et al. 2022](https://www.science.org/doi/abs/10.1126/science.ade2574))
|
| 88 |
+
- April 2022: New inverse folding model ESM-IF1 released, trained on CATH and UniRef50 predicted structures.
|
| 89 |
+
- August 2021: Added flexibility to tokenizer to allow for spaces and special tokens (like `<mask>`) in sequence.
|
| 90 |
+
- July 2021: New pre-trained model ESM-1v released, trained on UniRef90 (see [Meier et al. 2021](https://doi.org/10.1101/2021.07.09.450648)).
|
| 91 |
+
- July 2021: New MSA Transformer released, with a minor fix in the row positional embeddings (`ESM-MSA-1b`).
|
| 92 |
+
- Feb 2021: MSA Transformer added (see [Rao et al. 2021](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1)). Example usage in [notebook](#notebooks).
|
| 93 |
+
- Dec 2020: [Self-Attention Contacts](#notebooks) for all pre-trained models (see [Rao et al. 2020](https://doi.org/10.1101/2020.12.15.422761))
|
| 94 |
+
- Dec 2020: Added new pre-trained model [ESM-1b](#perf_related) (see [Rives et al. 2019](https://doi.org/10.1101/622803) Appendix B)
|
| 95 |
+
- Dec 2020: [ESM Structural Split Dataset](#available-esmssd) (see [Rives et al. 2019](https://doi.org/10.1101/622803) Appendix A.10)
|
| 96 |
+
|
| 97 |
+
</details>
|
| 98 |
+
|
| 99 |
+
## Main models you should use <a name="main-models"></a>
|
| 100 |
+
|
| 101 |
+
| Shorthand | `esm.pretrained.` | Dataset | Description |
|
| 102 |
+
|-----------|-----------------------------|---------|--------------|
|
| 103 |
+
| ESM-2 | `esm2_t36_3B_UR50D()` `esm2_t48_15B_UR50D()` | UR50 (sample UR90) | SOTA general-purpose protein language model. Can be used to predict structure, function and other protein properties directly from individual sequences. Released with [Lin et al. 2022](https://www.science.org/doi/abs/10.1126/science.ade2574) (Aug 2022 update). |
|
| 104 |
+
| ESMFold | `esmfold_v1()` | PDB + UR50 | End-to-end single sequence 3D structure predictor (Nov 2022 update). |
|
| 105 |
+
| ESM-MSA-1b| `esm_msa1b_t12_100M_UR50S()` | UR50 + MSA | MSA Transformer language model. Can be used to extract embeddings from an MSA. Enables SOTA inference of structure. Released with [Rao et al. 2021](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v2) (ICML'21 version, June 2021). |
|
| 106 |
+
| ESM-1v | `esm1v_t33_650M_UR90S_1()` ... `esm1v_t33_650M_UR90S_5()`| UR90 | Language model specialized for prediction of variant effects. Enables SOTA zero-shot prediction of the functional effects of sequence variations. Same architecture as ESM-1b, but trained on UniRef90. Released with [Meier et al. 2021](https://doi.org/10.1101/2021.07.09.450648). |
|
| 107 |
+
| ESM-IF1 | `esm_if1_gvp4_t16_142M_UR50()` | CATH + UR50 | Inverse folding model. Can be used to design sequences for given structures, or to predict functional effects of sequence variation for given structures. Enables SOTA fixed backbone sequence design. Released with [Hsu et al. 2022](https://doi.org/10.1101/2022.04.10.487779). |
|
| 108 |
+
|
| 109 |
+
For a complete list of available models, with details and release notes, see [Pre-trained Models](#available-models).
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
## Usage <a name="usage"></a>
|
| 113 |
+
|
| 114 |
+
### Quick start <a name="quickstart"></a>
|
| 115 |
+
|
| 116 |
+
An easy way to get started is to load ESM or ESMFold through the [HuggingFace transformers library](https://huggingface.co/docs/transformers/model_doc/esm),
|
| 117 |
+
which has simplified the ESMFold dependencies and provides a standardized API and tools to work with state-of-the-art pretrained models.
|
| 118 |
+
|
| 119 |
+
Alternatively, [ColabFold](https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/ESMFold.ipynb) has integrated ESMFold so that you can
|
| 120 |
+
easily run it directly in the browser on a Google Colab instance.
|
| 121 |
+
|
| 122 |
+
We also provide an API which you can access through curl or on [the ESM Metagenomic Atlas web page](https://esmatlas.com/resources?action=fold).
|
| 123 |
+
```
|
| 124 |
+
curl -X POST --data "KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL" https://api.esmatlas.com/foldSequence/v1/pdb/
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
For ESM-MSA-1b, ESM-IF1, or any of the other models you can use the original implementation from our repo directly via the instructions below.
|
| 128 |
+
|
| 129 |
+
### Getting started with this repo <a name="repostart"></a>
|
| 130 |
+
|
| 131 |
+
As a prerequisite, you must have PyTorch installed to use this repository.
|
| 132 |
+
|
| 133 |
+
You can use this one-liner for installation, using the latest release of esm:
|
| 134 |
+
|
| 135 |
+
```bash
|
| 136 |
+
pip install fair-esm # latest release, OR:
|
| 137 |
+
pip install git+https://github.com/facebookresearch/esm.git # bleeding edge, current repo main branch
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
To use the ESMFold model, make sure you start from an environment with python <= 3.9 and pytorch installed.
|
| 141 |
+
Then add the `[esmfold]` option to your pip install, which will install the dependencies for OpenFold
|
| 142 |
+
automatically. Openfold installation requires `nvcc`.
|
| 143 |
+
|
| 144 |
+
```bash
|
| 145 |
+
pip install "fair-esm[esmfold]"
|
| 146 |
+
# OpenFold and its remaining dependency
|
| 147 |
+
pip install 'dllogger @ git+https://github.com/NVIDIA/dllogger.git'
|
| 148 |
+
pip install 'openfold @ git+https://github.com/aqlaboratory/openfold.git@4b41059694619831a7db195b7e0988fc4ff3a307'
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
**NOTE**: If openfold installation fails, please double check that `nvcc` is available and that a cuda-compatible version of PyTorch has been installed.
|
| 152 |
+
|
| 153 |
+
Alternatively, we provide the `esmfold` conda environment, which can be built via `conda env create -f environment.yml`.
|
| 154 |
+
|
| 155 |
+
We also support PyTorch Hub, which removes the need to clone and/or install this repository yourself:
|
| 156 |
+
|
| 157 |
+
```python
|
| 158 |
+
import torch
|
| 159 |
+
model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
After pip install, you can load and use a pretrained model as follows:
|
| 163 |
+
|
| 164 |
+
```python
|
| 165 |
+
import torch
|
| 166 |
+
import esm
|
| 167 |
+
|
| 168 |
+
# Load ESM-2 model
|
| 169 |
+
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
|
| 170 |
+
batch_converter = alphabet.get_batch_converter()
|
| 171 |
+
model.eval() # disables dropout for deterministic results
|
| 172 |
+
|
| 173 |
+
# Prepare data (first 2 sequences from ESMStructuralSplitDataset superfamily / 4)
|
| 174 |
+
data = [
|
| 175 |
+
("protein1", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"),
|
| 176 |
+
("protein2", "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE"),
|
| 177 |
+
("protein2 with mask","KALTARQQEVFDLIRD<mask>ISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE"),
|
| 178 |
+
("protein3", "K A <mask> I S Q"),
|
| 179 |
+
]
|
| 180 |
+
batch_labels, batch_strs, batch_tokens = batch_converter(data)
|
| 181 |
+
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)
|
| 182 |
+
|
| 183 |
+
# Extract per-residue representations (on CPU)
|
| 184 |
+
with torch.no_grad():
|
| 185 |
+
results = model(batch_tokens, repr_layers=[33], return_contacts=True)
|
| 186 |
+
token_representations = results["representations"][33]
|
| 187 |
+
|
| 188 |
+
# Generate per-sequence representations via averaging
|
| 189 |
+
# NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
|
| 190 |
+
sequence_representations = []
|
| 191 |
+
for i, tokens_len in enumerate(batch_lens):
|
| 192 |
+
sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))
|
| 193 |
+
|
| 194 |
+
# Look at the unsupervised self-attention map contact predictions
|
| 195 |
+
import matplotlib.pyplot as plt
|
| 196 |
+
for (_, seq), tokens_len, attention_contacts in zip(data, batch_lens, results["contacts"]):
|
| 197 |
+
plt.matshow(attention_contacts[: tokens_len, : tokens_len])
|
| 198 |
+
plt.title(seq)
|
| 199 |
+
plt.show()
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
### ESMFold Structure Prediction <a name="esmfold"></a>
|
| 204 |
+
|
| 205 |
+
After installing with the `[esmfold]` option, you can use the ESMFold structure prediction model as follows:
|
| 206 |
+
|
| 207 |
+
```python
|
| 208 |
+
import torch
|
| 209 |
+
import esm
|
| 210 |
+
|
| 211 |
+
model = esm.pretrained.esmfold_v1()
|
| 212 |
+
model = model.eval().cuda()
|
| 213 |
+
|
| 214 |
+
# Optionally, uncomment to set a chunk size for axial attention. This can help reduce memory.
|
| 215 |
+
# Lower sizes will have lower memory requirements at the cost of increased speed.
|
| 216 |
+
# model.set_chunk_size(128)
|
| 217 |
+
|
| 218 |
+
sequence = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"
|
| 219 |
+
# Multimer prediction can be done with chains separated by ':'
|
| 220 |
+
|
| 221 |
+
with torch.no_grad():
|
| 222 |
+
output = model.infer_pdb(sequence)
|
| 223 |
+
|
| 224 |
+
with open("result.pdb", "w") as f:
|
| 225 |
+
f.write(output)
|
| 226 |
+
|
| 227 |
+
import biotite.structure.io as bsio
|
| 228 |
+
struct = bsio.load_structure("result.pdb", extra_fields=["b_factor"])
|
| 229 |
+
print(struct.b_factor.mean()) # this will be the pLDDT
|
| 230 |
+
# 88.3
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
Besides `esm.pretrained.esmfold_v1()` which is the best performing model we recommend using, we
|
| 235 |
+
also provide `esm.pretrained.esmfold_v0()` which was used for the experiments in
|
| 236 |
+
[Lin et al. 2022](https://www.science.org/doi/abs/10.1126/science.ade2574).
|
| 237 |
+
|
| 238 |
+
We also provide a command line interface (`esm-fold`) that efficiently predicts structures in bulk from a FASTA file using ESMFold:
|
| 239 |
+
```
|
| 240 |
+
usage: esm-fold [-h] -i FASTA -o PDB [--num-recycles NUM_RECYCLES]
|
| 241 |
+
[--max-tokens-per-batch MAX_TOKENS_PER_BATCH]
|
| 242 |
+
[--chunk-size CHUNK_SIZE] [--cpu-only] [--cpu-offload]
|
| 243 |
+
|
| 244 |
+
optional arguments:
|
| 245 |
+
-h, --help show this help message and exit
|
| 246 |
+
-i FASTA, --fasta FASTA
|
| 247 |
+
Path to input FASTA file
|
| 248 |
+
-o PDB, --pdb PDB Path to output PDB directory
|
| 249 |
+
--num-recycles NUM_RECYCLES
|
| 250 |
+
Number of recycles to run. Defaults to number used in
|
| 251 |
+
training (4).
|
| 252 |
+
--max-tokens-per-batch MAX_TOKENS_PER_BATCH
|
| 253 |
+
Maximum number of tokens per gpu forward-pass. This
|
| 254 |
+
will group shorter sequences together for batched
|
| 255 |
+
prediction. Lowering this can help with out of memory
|
| 256 |
+
issues, if these occur on short sequences.
|
| 257 |
+
--chunk-size CHUNK_SIZE
|
| 258 |
+
Chunks axial attention computation to reduce memory
|
| 259 |
+
usage from O(L^2) to O(L). Equivalent to running a for
|
| 260 |
+
loop over chunks of each dimension. Lower values
|
| 261 |
+
will result in lower memory usage at the cost of
|
| 262 |
+
speed. Recommended values: 128, 64, 32. Default: None.
|
| 263 |
+
--cpu-only CPU only
|
| 264 |
+
--cpu-offload Enable CPU offloading
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
The command will make one prediction for every sequence in the fasta file. Multimers can be predicted and should be entered in the fasta file as a single sequence, with chains separated by a ":" character.
|
| 268 |
+
|
| 269 |
+
By default, predictions will be batched together so that shorter sequences are predicted simultaneously. This can be disabled by setting `--max-tokens-per-batch=0`. Batching can significantly improve prediction speed on shorter sequences.
|
| 270 |
+
|
| 271 |
+
The `--cpu-offload` flag can be useful for making predictions on longer sequences. It will attempt to offload some parameters to the CPU RAM, rather than storing on GPU.
|
| 272 |
+
|
| 273 |
+
Finally, the ablation experiments for LMs of varying sizes [Lin et al. 2022 table S1](https://www.science.org/doi/abs/10.1126/science.ade2574) are released as `esm.pretrained.esmfold_structure_module_only_*()`. We don't recommend using these models for structure prediction.
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
### Compute embeddings in bulk from FASTA <a name="bulk_fasta"></a>
|
| 277 |
+
|
| 278 |
+
We provide a command line interface (`esm-extract`) that efficiently extracts embeddings in bulk for a FASTA file from the ESM:
|
| 279 |
+
```
|
| 280 |
+
usage: esm-extract [-h] [--toks_per_batch TOKS_PER_BATCH]
|
| 281 |
+
[--repr_layers REPR_LAYERS [REPR_LAYERS ...]] --include
|
| 282 |
+
{mean,per_tok,bos,contacts}
|
| 283 |
+
[{mean,per_tok,bos,contacts} ...]
|
| 284 |
+
[--truncation_seq_length TRUNCATION_SEQ_LENGTH]
|
| 285 |
+
model_location fasta_file output_dir
|
| 286 |
+
|
| 287 |
+
Extract per-token representations and model outputs for sequences in a FASTA
|
| 288 |
+
file
|
| 289 |
+
|
| 290 |
+
positional arguments:
|
| 291 |
+
model_location PyTorch model file OR name of pretrained model to
|
| 292 |
+
download (see README for models)
|
| 293 |
+
fasta_file FASTA file on which to extract representations
|
| 294 |
+
output_dir output directory for extracted representations
|
| 295 |
+
|
| 296 |
+
optional arguments:
|
| 297 |
+
-h, --help show this help message and exit
|
| 298 |
+
--toks_per_batch TOKS_PER_BATCH
|
| 299 |
+
maximum batch size
|
| 300 |
+
--repr_layers REPR_LAYERS [REPR_LAYERS ...]
|
| 301 |
+
layers indices from which to extract representations
|
| 302 |
+
(0 to num_layers, inclusive)
|
| 303 |
+
--include {mean,per_tok,bos,contacts} [{mean,per_tok,bos,contacts} ...]
|
| 304 |
+
specify which representations to return
|
| 305 |
+
--truncation_seq_length TRUNCATION_SEQ_LENGTH
|
| 306 |
+
truncate sequences longer than the given value
|
| 307 |
+
```
|
| 308 |
+
|
| 309 |
+
The following commands allow the extraction of the final-layer embedding for a FASTA file from the ESM-2 model:
|
| 310 |
+
|
| 311 |
+
```bash
|
| 312 |
+
esm-extract esm2_t33_650M_UR50D examples/data/some_proteins.fasta \
|
| 313 |
+
  examples/data/some_proteins_emb_esm2 --repr_layers 0 32 33 --include mean per_tok
|
| 314 |
+
```
|
| 315 |
+
```bash
|
| 316 |
+
python scripts/extract.py esm2_t33_650M_UR50D examples/data/some_proteins.fasta \
|
| 317 |
+
examples/data/some_proteins_emb_esm2 --repr_layers 0 32 33 --include mean per_tok
|
| 318 |
+
```
|
| 319 |
+
|
| 320 |
+
A cuda device is optional and will be auto-detected.
|
| 321 |
+
|
| 322 |
+
Directory `some_proteins_emb_esm2/` now contains one `.pt` file per FASTA sequence; use `torch.load()` to load them.
|
| 323 |
+
`scripts/extract.py` has flags that determine what's included in the `.pt` file:
|
| 324 |
+
* `--repr-layers` (default: final only) selects which layers to include embeddings from.
|
| 325 |
+
* `--include` specifies what embeddings to save. You can use the following:
|
| 326 |
+
* `per_tok` includes the full sequence, with an embedding per amino acid (seq_len x hidden_dim).
|
| 327 |
+
* `mean` includes the embeddings averaged over the full sequence, per layer.
|
| 328 |
+
* `bos` includes the embeddings from the beginning-of-sequence token.
|
| 329 |
+
(NOTE: Don't use with the pre-trained models - we trained without bos-token supervision)
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
### CPU offloading for inference with large models <a name="fsdp"></a>
|
| 333 |
+
If you want to load very large models like 15B and/or do inference on long sequences on your machine, regular GPU inference may lead to OOM errors.
|
| 334 |
+
We show how to load the model with Fairscale's [Fully Sharded Data Parallel (FSDP)](https://fairscale.readthedocs.io/en/stable/api/nn/fsdp.html) and
|
| 335 |
+
use its CPU offloading feature.
|
| 336 |
+
This allows to do inference of large models on a single GPU.
|
| 337 |
+
Please check out `examples/esm2_infer_fairscale_fsdp_cpu_offloading.py` for more details.
|
| 338 |
+
|
| 339 |
+
### Zero-shot variant prediction <a name="zs_variant"></a>
|
| 340 |
+
See "[examples/variant-prediction/](examples/variant-prediction/)" for code and pre-trained weights for the ESM-1v models described in
|
| 341 |
+
[Language models enable zero-shot prediction of the effects of mutations on protein function. (Meier et al. 2021)](https://doi.org/10.1101/2021.07.09.450648).
|
| 342 |
+
|
| 343 |
+
Note that ESM-2 could be used for variant prediction as well, and is expected to have similar performance to ESM-1v.
|
| 344 |
+
|
| 345 |
+
### Inverse folding <a name="invf"></a>
|
| 346 |
+
See "[examples/inverse_folding/](examples/inverse_folding/)" for detailed user guide. The ESM-IF1 model is described as `GVPTransformer` in [Learning inverse folding from millions of predicted structures. (Hsu et al. 2022)](https://doi.org/10.1101/2022.04.10.487779).
|
| 347 |
+
|
| 348 |
+
We also provide a colab notebook for the sequence design and sequence scoring functionalities.
|
| 349 |
+
|
| 350 |
+
[<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/inverse_folding/notebook_multichain.ipynb)
|
| 351 |
+
|
| 352 |
+
The ESM-IF1 inverse folding model is built for predicting protein sequences
|
| 353 |
+
from their backbone atom coordinates. We provide scripts here 1) to sample sequence
|
| 354 |
+
designs for a given structure and 2) to score sequences for a given structure.
|
| 355 |
+
|
| 356 |
+
Trained with 12M protein structures predicted by AlphaFold2, the ESM-IF1
|
| 357 |
+
model consists of invariant geometric input processing layers followed by a
|
| 358 |
+
sequence-to-sequence transformer, and achieves 51% native sequence recovery on
|
| 359 |
+
structurally held-out backbones with 72% recovery for buried residues.
|
| 360 |
+
The model is also trained with span masking to tolerate missing backbone
|
| 361 |
+
coordinates and therefore can predict sequences for partially masked structures.
|
| 362 |
+
|
| 363 |
+
#### Sample sequence designs for a given structure
|
| 364 |
+
The environment setup is described in [this subsection of examples/inverse_folding](examples/inverse_folding#recommended-environment).
|
| 365 |
+
|
| 366 |
+
To sample sequences for a given structure in PDB or mmCIF format, use the
|
| 367 |
+
`sample_sequences.py` script. The input file can have either `.pdb` or
|
| 368 |
+
`.cif` as suffix.
|
| 369 |
+
|
| 370 |
+
For example, to sample 3 sequence designs for the golgi casein kinase structure
|
| 371 |
+
(PDB [5YH2](https://www.rcsb.org/structure/5yh2); [PDB Molecule of the Month
|
| 372 |
+
from January 2022](https://pdb101.rcsb.org/motm/265)), we can run the following
|
| 373 |
+
command from the esm root directory:
|
| 374 |
+
```bash
|
| 375 |
+
python examples/inverse_folding/sample_sequences.py examples/inverse_folding/data/5YH2.pdb \
|
| 376 |
+
--chain C --temperature 1 --num-samples 3 --outpath examples/inverse_folding/output/sampled_sequences.fasta
|
| 377 |
+
```
|
| 378 |
+
|
| 379 |
+
The sampled sequences will be saved in a fasta format to the specified output file.
|
| 380 |
+
|
| 381 |
+
The temperature parameter controls the sharpness of the probability
|
| 382 |
+
distribution for sequence sampling. Higher sampling temperatures yield more
|
| 383 |
+
diverse sequences but likely with lower native sequence recovery.
|
| 384 |
+
The default sampling temperature is 1. To optimize for native sequence
|
| 385 |
+
recovery, we recommend sampling with low temperature such as 1e-6.
|
| 386 |
+
|
| 387 |
+
#### Scoring sequences
|
| 388 |
+
To score the conditional log-likelihoods for sequences conditioned on a given
|
| 389 |
+
structure, use the `score_log_likelihoods.py` script.
|
| 390 |
+
|
| 391 |
+
For example, to score the sequences in `examples/inverse_folding/data/5YH2_mutated_seqs.fasta`
|
| 392 |
+
according to the structure in `examples/inverse_folding/data/5YH2.pdb`, we can run
|
| 393 |
+
the following command from the esm root directory:
|
| 394 |
+
```
|
| 395 |
+
python examples/inverse_folding/score_log_likelihoods.py examples/inverse_folding/data/5YH2.pdb \
|
| 396 |
+
examples/inverse_folding/data/5YH2_mutated_seqs.fasta --chain C \
|
| 397 |
+
--outpath examples/inverse_folding/output/5YH2_mutated_seqs_scores.csv
|
| 398 |
+
```
|
| 399 |
+
|
| 400 |
+
The conditional log-likelihoods are saved in a csv format in the specified output path.
|
| 401 |
+
The output values are the average log-likelihoods averaged over all amino acids in a sequence.
|
| 402 |
+
|
| 403 |
+
For more information, see "[./examples/inverse_folding/](examples/inverse_folding/)" for detailed user guide.
|
| 404 |
+
|
| 405 |
+
## ESM Metagenomic Atlas <a name="atlas"></a>
|
| 406 |
+
|
| 407 |
+
Please visit the [ESM Metagenomic Atlas](https://esmatlas.com/) website, and
|
| 408 |
+
see our [blog post](https://ai.facebook.com/blog/protein-folding-esmfold-metagenomics/) to learn more.
|
| 409 |
+
|
| 410 |
+
Bulk download instructions are available in a separate README [here](scripts/atlas/README.md).
|
| 411 |
+
|
| 412 |
+
The Atlas resources include a page to [fold a sequence using ESMFold](https://esmatlas.com/resources?action=fold),
|
| 413 |
+
searching a subset of the ESM Atlas by [structure](https://esmatlas.com/resources?action=search_structure) or
|
| 414 |
+
[sequence](https://esmatlas.com/resources?action=search_sequence),
|
| 415 |
+
as well as an [API](https://esmatlas.com/about#api) to access those resources programmatically.
|
| 416 |
+
|
| 417 |
+
Foldseek provides search against the Atlas without the length limitation [here](https://search.foldseek.com/search).
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
## Notebooks <a name="notebooks"></a>
|
| 421 |
+
|
| 422 |
+
### Inverse folding - predicting or scoring sequences based on backbone structures
|
| 423 |
+
|
| 424 |
+
[<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/inverse_folding/notebook.ipynb)
|
| 425 |
+
|
| 426 |
+
The ESM-IF1 inverse folding model predicts protein sequences from their backbone atom coordinates, trained with 12M protein structures predicted by AlphaFold2.
|
| 427 |
+
This notebook guides you through examples of sampling sequences, calculating conditional log-likelihoods, and extracting encoder output as structure representation.
|
| 428 |
+
|
| 429 |
+
### Supervised variant prediction - training a classifier on the embeddings
|
| 430 |
+
|
| 431 |
+
[<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/sup_variant_prediction.ipynb)
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
To help you get started with using the embeddings, this [jupyter notebook tutorial](examples/sup_variant_prediction.ipynb) shows how to train a supervised variant predictor using embeddings from ESM-1.
|
| 435 |
+
You can adopt a similar protocol to train a model for any downstream task, even with limited data.
|
| 436 |
+
First you can obtain the embeddings for ``examples/data/P62593.fasta`` either by [downloading the precomputed](https://dl.fbaipublicfiles.com/fair-esm/examples/P62593_reprs.tar.gz) embeddings
|
| 437 |
+
as instructed in the notebook or by running the following:
|
| 438 |
+
|
| 439 |
+
```bash
|
| 440 |
+
# Obtain the embeddings
|
| 441 |
+
python scripts/extract.py esm1v_t33_650M_UR90S_1 examples/data/P62593.fasta \
|
| 442 |
+
examples/data/P62593_emb_esm1v --repr_layers 33 --include mean
|
| 443 |
+
```
|
| 444 |
+
|
| 445 |
+
Then, follow the remaining instructions in the tutorial. You can also run the tutorial in a [colab notebook](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/sup_variant_prediction.ipynb).
|
| 446 |
+
|
| 447 |
+
**Note, alternatively use [the newer instructions for zero-shot variant prediction](examples/variant-prediction/),
|
| 448 |
+
which predicts mutational effects without any supervised training.**
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
### Unsupervised contact prediction
|
| 452 |
+
[<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/contact_prediction.ipynb)
|
| 453 |
+
|
| 454 |
+
This [jupyter notebook tutorial](examples/contact_prediction.ipynb) demonstrates contact prediction with both the ESM-2 and MSA Transformer (ESM-MSA-1) models.
|
| 455 |
+
Contact prediction is based on a logistic regression over the model's attention maps.
|
| 456 |
+
This methodology is based on our ICLR 2021 paper,
|
| 457 |
+
[Transformer protein language models are unsupervised structure learners. (Rao et al. 2020)](https://doi.org/10.1101/2020.12.15.422761)
|
| 458 |
+
The MSA Transformer (ESM-MSA-1) takes a multiple sequence alignment (MSA) as input, and uses the tied row self-attention maps in the same way.
|
| 459 |
+
See [MSA Transformer. (Rao et al. 2021)](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1).
|
| 460 |
+
|
| 461 |
+
To get unsupervised attention-based contacts, call `model.predict_contacts(tokens)` or `model(tokens, return_contacts=True)`.
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
### ESMStructuralSplitDataset and self-attention contact prediction
|
| 465 |
+
[<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/esm_structural_dataset.ipynb)
|
| 466 |
+
|
| 467 |
+
And this [jupyter notebook tutorial](examples/esm_structural_dataset.ipynb) shows how to load and index the `ESMStructuralSplitDataset`,
|
| 468 |
+
and computes the self-attention map unsupervised contact predictions using ESM-2.
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
## Available Models and Datasets <a name="available"></a>
|
| 472 |
+
|
| 473 |
+
### Pre-trained Models <a name="available-models"></a>
|
| 474 |
+
|
| 475 |
+
| Shorthand | `esm.pretrained.` | #layers | #params | Dataset | Embedding Dim | Model URL (automatically downloaded to `~/.cache/torch/hub/checkpoints`) |
|
| 476 |
+
|-----------|---------------------|---------|-------------|---------|---------------|-----------------------------------------------------------------------|
|
| 477 |
+
| ESM-2 | `esm2_t48_15B_UR50D` | 48 | 15B | UR50/D 2021_04 | 5120 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t48_15B_UR50D.pt |
|
| 478 |
+
| | `esm2_t36_3B_UR50D` | 36 | 3B | UR50/D 2021_04 | 2560 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t36_3B_UR50D.pt |
|
| 479 |
+
| | `esm2_t33_650M_UR50D` | 33 | 650M | UR50/D 2021_04 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt |
|
| 480 |
+
| | `esm2_t30_150M_UR50D` | 30 | 150M | UR50/D 2021_04 | 640 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t30_150M_UR50D.pt |
|
| 481 |
+
| | `esm2_t12_35M_UR50D` | 12 | 35M | UR50/D 2021_04 | 480 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t12_35M_UR50D.pt |
|
| 482 |
+
| | `esm2_t6_8M_UR50D` | 6 | 8M | UR50/D 2021_04 | 320 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t6_8M_UR50D.pt |
|
| 483 |
+
| ESMFold | `esmfold_v1` | 48 (+36) | 690M (+3B) | UR50/D 2021_04 | - | https://dl.fbaipublicfiles.com/fair-esm/models/esmfold_3B_v1.pt |
|
| 484 |
+
| | `esmfold_v0` | 48 (+36) | 690M (+3B) | UR50/D 2021_04 | - | https://dl.fbaipublicfiles.com/fair-esm/models/esmfold_3B_v0.pt |
|
| 485 |
+
| | `esmfold_structure_module_only_*` | 0 (+various) | various | UR50/D 2021_04 | - | https://dl.fbaipublicfiles.com/fair-esm/models/esmfold_structure_module_only_* |
|
| 486 |
+
| ESM-IF1 | `esm_if1_gvp4_t16_142M_UR50` | 20 | 124M | CATH 4.3 + predicted structures for UR50 | 512 | https://dl.fbaipublicfiles.com/fair-esm/models/esm_if1_gvp4_t16_142M_UR50.pt |
|
| 487 |
+
| ESM-1v | `esm1v_t33_650M_UR90S_[1-5]` | 33 | 650M | UR90/S 2020_03 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1v_t33_650M_UR90S_1.pt |
|
| 488 |
+
| ESM-MSA-1b| `esm_msa1b_t12_100M_UR50S` | 12 | 100M | UR50/S + MSA 2018_03 | 768 | https://dl.fbaipublicfiles.com/fair-esm/models/esm_msa1b_t12_100M_UR50S.pt |
|
| 489 |
+
| ESM-MSA-1 | `esm_msa1_t12_100M_UR50S` | 12 | 100M | UR50/S + MSA 2018_03 | 768 | https://dl.fbaipublicfiles.com/fair-esm/models/esm_msa1_t12_100M_UR50S.pt |
|
| 490 |
+
| ESM-1b | `esm1b_t33_650M_UR50S` | 33 | 650M | UR50/S 2018_03 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1b_t33_650M_UR50S.pt |
|
| 491 |
+
| ESM-1 | `esm1_t34_670M_UR50S` | 34 | 670M | UR50/S 2018_03 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1_t34_670M_UR50S.pt |
|
| 492 |
+
| | `esm1_t34_670M_UR50D` | 34 | 670M | UR50/D 2018_03 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1_t34_670M_UR50D.pt |
|
| 493 |
+
| | `esm1_t34_670M_UR100` | 34 | 670M | UR100 2018_03 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1_t34_670M_UR100.pt |
|
| 494 |
+
| | `esm1_t12_85M_UR50S` | 12 | 85M | UR50/S 2018_03 | 768 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1_t12_85M_UR50S.pt |
|
| 495 |
+
| | `esm1_t6_43M_UR50S` | 6 | 43M | UR50/S 2018_03 | 768 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1_t6_43M_UR50S.pt |
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
Here is a chronological list of the released models and the paper they were introduced in:
|
| 499 |
+
|
| 500 |
+
| Shorthand | Release Notes |
|
| 501 |
+
|------------|---------------|
|
| 502 |
+
| ESM-1 | Released with Rives et al. 2019 (Aug 2020 update). |
|
| 503 |
+
| ESM-1b | Released with Rives et al. 2019 (Dec 2020 update). See Appendix B. |
|
| 504 |
+
| ESM-MSA-1 | Released with Rao et al. 2021 (Preprint v1). |
|
| 505 |
+
| ESM-MSA-1b | Released with Rao et al. 2021 (ICML'21 version, June 2021). |
|
| 506 |
+
| ESM-1v | Released with Meier et al. 2021. |
|
| 507 |
+
| ESM-IF1 | Released with Hsu et al. 2022. |
|
| 508 |
+
| ESM-2 | Released with Lin et al. 2022. |
|
| 509 |
+
|
| 510 |
+
### ESM Structural Split Dataset <a name="available-esmssd"></a>
|
| 511 |
+
This is a five-fold cross validation dataset of protein domain structures that can be used to measure generalization of representations
|
| 512 |
+
across different levels of structural dissimilarity.
|
| 513 |
+
The dataset implements structural holdouts at the family, superfamily, and fold
|
| 514 |
+
level. The SCOPe database is used to classify domains. Independently for each level of structural hold-out,
|
| 515 |
+
the domains are split into 5 equal sets, i.e. five sets of folds, superfamilies, or families. This ensures
|
| 516 |
+
that for each of the five partitions, structures having the same classification do not appear in both the
|
| 517 |
+
train and test sets. For a given classification level each structure appears in a test set once, so that
|
| 518 |
+
in the cross validation experiment each of the structures will be evaluated exactly once.
|
| 519 |
+
|
| 520 |
+
The dataset provides 3d coordinates, distance maps, and secondary structure labels.
|
| 521 |
+
For further details on the construction of the dataset
|
| 522 |
+
see [Rives et al. 2019](https://doi.org/10.1101/622803) Appendix A.10.
|
| 523 |
+
|
| 524 |
+
This [jupyter notebook tutorial](examples/esm_structural_dataset.ipynb) shows how to load and index the `ESMStructuralSplitDataset`.
|
| 525 |
+
|
| 526 |
+
`ESMStructuralSplitDataset`, upon initializing, will download `splits` and `pkl`.
|
| 527 |
+
We also provide `msas` for each of the domains. The data can be directly downloaded below.
|
| 528 |
+
|
| 529 |
+
| Name | Description | URL |
|
| 530 |
+
|--------|-------------------------------------------------------------------------------|-----------------------------------------------------------------------|
|
| 531 |
+
| splits | train/valid splits | https://dl.fbaipublicfiles.com/fair-esm/structural-data/splits.tar.gz |
|
| 532 |
+
| pkl | pkl objects containing sequence, SSP labels, distance map, and 3d coordinates | https://dl.fbaipublicfiles.com/fair-esm/structural-data/pkl.tar.gz |
|
| 533 |
+
| msas | a3m files containing MSA for each domain | https://dl.fbaipublicfiles.com/fair-esm/structural-data/msas.tar.gz |
|
| 534 |
+
|
| 535 |
+
### Pre-training Dataset Split <a name="available-pretraining-split"></a>
|
| 536 |
+
The split files establishing which UniRef50 clusters were used as held-out evaluation set for pre-training
|
| 537 |
+
in [Rives et al. 2019](https://doi.org/10.1101/622803) and [Rao et al. 2021](https://doi.org/10.1101/2021.02.12.430858) can be found here:
|
| 538 |
+
* [UniRef50 IDs of evaluation set](https://dl.fbaipublicfiles.com/fair-esm/pretraining-data/uniref201803_ur50_valid_headers.txt.gz): 3.016 M clusters
|
| 539 |
+
* [UniRef100 IDs of evaluation set](https://dl.fbaipublicfiles.com/fair-esm/pretraining-data/uniref201803_ur100_valid_headers.txt.gz): 13.745 M proteins, expanding the same UniRef50 clusters.
|
| 540 |
+
|
| 541 |
+
These files contain only the UniRef50 IDs and UniRef100 IDs corresponding to the [UniRef database, 2018-03 release](https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/release-2018_03/uniref/)
|
| 542 |
+
which is released by the UniProt Consortium under a [Creative Commons Attribution (CC BY 4.0) License](https://www.uniprot.org/help/license).
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
### Comparison to related works <a name="perf_related"></a>
|
| 546 |
+
<!--
|
| 547 |
+
DO NOT EDIT THIS TABLE! This is the source of truth:
|
| 548 |
+
https://docs.google.com/spreadsheets/d/1RPvWF47rIMEr-Jg-SRCoGElHcwCl5d7RyEeSyPgp59A/edit#gid=0
|
| 549 |
+
exported via https://www.tablesgenerator.com/html_tables
|
| 550 |
+
-->
|
| 551 |
+
|
| 552 |
+
<table class="tg">
|
| 553 |
+
<thead>
|
| 554 |
+
<tr>
|
| 555 |
+
<th class="tg-0thz"><span style="font-weight:bold">Task</span></th>
|
| 556 |
+
<th class="tg-j6zm" colspan="3"><span style="font-weight:bold">Unsupervised contact prediction</span></th>
|
| 557 |
+
<th class="tg-j6zm" colspan="2"><span style="font-weight:bold">Structure Prediction</span></th>
|
| 558 |
+
</tr>
|
| 559 |
+
</thead>
|
| 560 |
+
<tbody>
|
| 561 |
+
<tr>
|
| 562 |
+
<td class="tg-j6zm"><span style="font-weight:bold">Test set</span></td>
|
| 563 |
+
<td class="tg-j6zm"><span style="font-weight:bold">Large valid</span></td>
|
| 564 |
+
<td class="tg-j6zm"><span style="font-weight:bold">CASP14</span></td>
|
| 565 |
+
<td class="tg-j6zm"><span style="font-weight:bold">CAMEO (Apr-Jun 2022)</span></td>
|
| 566 |
+
<td class="tg-j6zm"><span style="font-weight:bold">CASP14</span></td>
|
| 567 |
+
<td class="tg-j6zm"><span style="font-weight:bold">CAMEO (Apr-Jun 2022)</span></td>
|
| 568 |
+
</tr>
|
| 569 |
+
<tr>
|
| 570 |
+
<td class="tg-7zrl">Gremlin (Potts)</td>
|
| 571 |
+
<td class="tg-7zrl">39.3</td>
|
| 572 |
+
<td class="tg-7zrl"></td>
|
| 573 |
+
<td class="tg-7zrl"></td>
|
| 574 |
+
<td class="tg-7zrl"></td>
|
| 575 |
+
<td class="tg-7zrl"></td>
|
| 576 |
+
</tr>
|
| 577 |
+
<tr>
|
| 578 |
+
<td class="tg-7zrl">TAPE</td>
|
| 579 |
+
<td class="tg-7zrl">11.2</td>
|
| 580 |
+
<td class="tg-7zrl"></td>
|
| 581 |
+
<td class="tg-7zrl"></td>
|
| 582 |
+
<td class="tg-7zrl"></td>
|
| 583 |
+
<td class="tg-7zrl"></td>
|
| 584 |
+
</tr>
|
| 585 |
+
<tr>
|
| 586 |
+
<td class="tg-7zrl">ProtBert-BFD</td>
|
| 587 |
+
<td class="tg-7zrl">34.1</td>
|
| 588 |
+
<td class="tg-7zrl"></td>
|
| 589 |
+
<td class="tg-7zrl"></td>
|
| 590 |
+
<td class="tg-7zrl"></td>
|
| 591 |
+
<td class="tg-7zrl"></td>
|
| 592 |
+
</tr>
|
| 593 |
+
<tr>
|
| 594 |
+
<td class="tg-7zrl">Prot-T5-XL-BFD</td>
|
| 595 |
+
<td class="tg-7zrl">35.6</td>
|
| 596 |
+
<td class="tg-7zrl"></td>
|
| 597 |
+
<td class="tg-7zrl"></td>
|
| 598 |
+
<td class="tg-2b7s">46.1</td>
|
| 599 |
+
<td class="tg-2b7s">62.6</td>
|
| 600 |
+
</tr>
|
| 601 |
+
<tr>
|
| 602 |
+
<td class="tg-7zrl">Prot-T5-XL-Ur50 (3B)</td>
|
| 603 |
+
<td class="tg-7zrl">47.9</td>
|
| 604 |
+
<td class="tg-7zrl"></td>
|
| 605 |
+
<td class="tg-7zrl"></td>
|
| 606 |
+
<td class="tg-2b7s">49.8</td>
|
| 607 |
+
<td class="tg-2b7s">69.4</td>
|
| 608 |
+
</tr>
|
| 609 |
+
<tr>
|
| 610 |
+
<td class="tg-7zrl">ESM-1</td>
|
| 611 |
+
<td class="tg-7zrl">33.7</td>
|
| 612 |
+
<td class="tg-7zrl"></td>
|
| 613 |
+
<td class="tg-7zrl"></td>
|
| 614 |
+
<td class="tg-7zrl"></td>
|
| 615 |
+
<td class="tg-7zrl"></td>
|
| 616 |
+
</tr>
|
| 617 |
+
<tr>
|
| 618 |
+
<td class="tg-7zrl">ESM-1b</td>
|
| 619 |
+
<td class="tg-7zrl">41.1</td>
|
| 620 |
+
<td class="tg-7zrl">24.4</td>
|
| 621 |
+
<td class="tg-7zrl">39</td>
|
| 622 |
+
<td class="tg-2b7s">41.6</td>
|
| 623 |
+
<td class="tg-2b7s">64.5</td>
|
| 624 |
+
</tr>
|
| 625 |
+
<tr>
|
| 626 |
+
<td class="tg-7zrl">ESM-1v</td>
|
| 627 |
+
<td class="tg-7zrl">35.3</td>
|
| 628 |
+
<td class="tg-7zrl"></td>
|
| 629 |
+
<td class="tg-7zrl"></td>
|
| 630 |
+
<td class="tg-7zrl"></td>
|
| 631 |
+
<td class="tg-7zrl"></td>
|
| 632 |
+
</tr>
|
| 633 |
+
<tr>
|
| 634 |
+
<td class="tg-7zrl">ESM-MSA-1b</td>
|
| 635 |
+
<td class="tg-7zrl">57.4</td>
|
| 636 |
+
<td class="tg-7zrl"></td>
|
| 637 |
+
<td class="tg-7zrl"></td>
|
| 638 |
+
<td class="tg-7zrl"></td>
|
| 639 |
+
<td class="tg-7zrl"></td>
|
| 640 |
+
</tr>
|
| 641 |
+
<tr>
|
| 642 |
+
<td class="tg-7zrl">ESM-2 (8M)</td>
|
| 643 |
+
<td class="tg-7zrl">15.9</td>
|
| 644 |
+
<td class="tg-7zrl">9.8</td>
|
| 645 |
+
<td class="tg-7zrl">15.7</td>
|
| 646 |
+
<td class="tg-2b7s">36.7</td>
|
| 647 |
+
<td class="tg-2b7s">48.1</td>
|
| 648 |
+
</tr>
|
| 649 |
+
<tr>
|
| 650 |
+
<td class="tg-7zrl">ESM-2 (35M)</td>
|
| 651 |
+
<td class="tg-7zrl">28.8</td>
|
| 652 |
+
<td class="tg-7zrl">16.4</td>
|
| 653 |
+
<td class="tg-7zrl">28.4</td>
|
| 654 |
+
<td class="tg-2b7s">41.4</td>
|
| 655 |
+
<td class="tg-2b7s">56.4</td>
|
| 656 |
+
</tr>
|
| 657 |
+
<tr>
|
| 658 |
+
<td class="tg-7zrl">ESM-2 (150M)</td>
|
| 659 |
+
<td class="tg-7zrl">42.2</td>
|
| 660 |
+
<td class="tg-7zrl">26.8</td>
|
| 661 |
+
<td class="tg-7zrl">40.1</td>
|
| 662 |
+
<td class="tg-2b7s">49.0</td>
|
| 663 |
+
<td class="tg-2b7s">64.9</td>
|
| 664 |
+
</tr>
|
| 665 |
+
<tr>
|
| 666 |
+
<td class="tg-7zrl">ESM-2 (700M)</td>
|
| 667 |
+
<td class="tg-7zrl">50.1</td>
|
| 668 |
+
<td class="tg-7zrl">32.5</td>
|
| 669 |
+
<td class="tg-7zrl">47.6</td>
|
| 670 |
+
<td class="tg-2b7s">51.3</td>
|
| 671 |
+
<td class="tg-2b7s">70.1</td>
|
| 672 |
+
</tr>
|
| 673 |
+
<tr>
|
| 674 |
+
<td class="tg-7zrl">ESM-2 (3B)</td>
|
| 675 |
+
<td class="tg-7zrl">52.7</td>
|
| 676 |
+
<td class="tg-7zrl">34.0</td>
|
| 677 |
+
<td class="tg-7zrl">49.9</td>
|
| 678 |
+
<td class="tg-2b7s">52.5</td>
|
| 679 |
+
<td class="tg-2b7s">71.8</td>
|
| 680 |
+
</tr>
|
| 681 |
+
<tr>
|
| 682 |
+
<td class="tg-7zrl">ESM-2 (15B)</td>
|
| 683 |
+
<td class="tg-7zrl">54.5</td>
|
| 684 |
+
<td class="tg-7zrl">37.0</td>
|
| 685 |
+
<td class="tg-7zrl">51.7</td>
|
| 686 |
+
<td class="tg-2b7s">55.4</td>
|
| 687 |
+
<td class="tg-2b7s">72.1</td>
|
| 688 |
+
</tr>
|
| 689 |
+
</tbody>
|
| 690 |
+
</table>
|
| 691 |
+
|
| 692 |
+
Comparison to related protein language models on structure prediction tasks.
|
| 693 |
+
|
| 694 |
+
* All contact numbers are the top-L,LR precision metric, where long range means sequence separation of at least 24 residues
|
| 695 |
+
* For unsupervised contact prediction, a sparse linear combination of the attention heads is used to directly predict protein contacts,
|
| 696 |
+
fitted with logistic regression on 20 structures.
|
| 697 |
+
For more details on the method, see [Rao et al. 2020](https://doi.org/10.1101/2020.12.15.422761).
|
| 698 |
+
* For structure prediction, an AlphaFold2 structure module is trained directly from the frozen language model embeddings.
|
| 699 |
+
For more details on the method, see [Lin et al. 2022](https://www.science.org/doi/abs/10.1126/science.ade2574).
|
| 700 |
+
* Direct coupling analysis methods (Gremlin, mfDCA, Psicov) and ESM-MSA-1 use the [trRosetta MSAs](https://yanglab.nankai.edu.cn/trRosetta/benchmark/), while other methods predict from single sequence.
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
## Citations <a name="citations"></a>
|
| 704 |
+
|
| 705 |
+
If you find the models useful in your research, we ask that you cite the relevant paper:
|
| 706 |
+
|
| 707 |
+
```bibtex
|
| 708 |
+
@article{rives2019biological,
|
| 709 |
+
author={Rives, Alexander and Meier, Joshua and Sercu, Tom and Goyal, Siddharth and Lin, Zeming and Liu, Jason and Guo, Demi and Ott, Myle and Zitnick, C. Lawrence and Ma, Jerry and Fergus, Rob},
|
| 710 |
+
title={Biological Structure and Function Emerge from Scaling Unsupervised Learning to 250 Million Protein Sequences},
|
| 711 |
+
year={2019},
|
| 712 |
+
doi={10.1101/622803},
|
| 713 |
+
url={https://www.biorxiv.org/content/10.1101/622803v4},
|
| 714 |
+
journal={PNAS}
|
| 715 |
+
}
|
| 716 |
+
```
|
| 717 |
+
|
| 718 |
+
For the self-attention contact prediction:
|
| 719 |
+
|
| 720 |
+
```bibtex
|
| 721 |
+
@article{rao2020transformer,
|
| 722 |
+
author = {Rao, Roshan M and Meier, Joshua and Sercu, Tom and Ovchinnikov, Sergey and Rives, Alexander},
|
| 723 |
+
title={Transformer protein language models are unsupervised structure learners},
|
| 724 |
+
year={2020},
|
| 725 |
+
doi={10.1101/2020.12.15.422761},
|
| 726 |
+
url={https://www.biorxiv.org/content/10.1101/2020.12.15.422761v1},
|
| 727 |
+
journal={bioRxiv}
|
| 728 |
+
}
|
| 729 |
+
```
|
| 730 |
+
|
| 731 |
+
For the MSA Transformer:
|
| 732 |
+
|
| 733 |
+
```bibtex
|
| 734 |
+
@article{rao2021msa,
|
| 735 |
+
author = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier, Joshua and Canny, John F. and Abbeel, Pieter and Sercu, Tom and Rives, Alexander},
|
| 736 |
+
title={MSA Transformer},
|
| 737 |
+
year={2021},
|
| 738 |
+
doi={10.1101/2021.02.12.430858},
|
| 739 |
+
url={https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1},
|
| 740 |
+
journal={bioRxiv}
|
| 741 |
+
}
|
| 742 |
+
```
|
| 743 |
+
|
| 744 |
+
For variant prediction using ESM-1v:
|
| 745 |
+
|
| 746 |
+
```bibtex
|
| 747 |
+
@article{meier2021language,
|
| 748 |
+
author = {Meier, Joshua and Rao, Roshan and Verkuil, Robert and Liu, Jason and Sercu, Tom and Rives, Alexander},
|
| 749 |
+
title = {Language models enable zero-shot prediction of the effects of mutations on protein function},
|
| 750 |
+
year={2021},
|
| 751 |
+
doi={10.1101/2021.07.09.450648},
|
| 752 |
+
url={https://www.biorxiv.org/content/10.1101/2021.07.09.450648v1},
|
| 753 |
+
journal={bioRxiv}
|
| 754 |
+
}
|
| 755 |
+
```
|
| 756 |
+
|
| 757 |
+
For inverse folding using ESM-IF1:
|
| 758 |
+
|
| 759 |
+
```bibtex
|
| 760 |
+
@article{hsu2022learning,
|
| 761 |
+
author = {Hsu, Chloe and Verkuil, Robert and Liu, Jason and Lin, Zeming and Hie, Brian and Sercu, Tom and Lerer, Adam and Rives, Alexander},
|
| 762 |
+
title = {Learning inverse folding from millions of predicted structures},
|
| 763 |
+
year = {2022},
|
| 764 |
+
doi = {10.1101/2022.04.10.487779},
|
| 765 |
+
url = {https://www.biorxiv.org/content/early/2022/04/10/2022.04.10.487779},
|
| 766 |
+
journal = {ICML}
|
| 767 |
+
}
|
| 768 |
+
```
|
| 769 |
+
|
| 770 |
+
For the ESM-2 language model and ESMFold:
|
| 771 |
+
|
| 772 |
+
```bibtex
|
| 773 |
+
@article{lin2022language,
|
| 774 |
+
title={Language models of protein sequences at the scale of evolution enable accurate structure prediction},
|
| 775 |
+
author={Lin, Zeming and Akin, Halil and Rao, Roshan and Hie, Brian and Zhu, Zhongkai and Lu, Wenting and Smetanin, Nikita and dos Santos Costa, Allan and Fazel-Zarandi, Maryam and Sercu, Tom and Candido, Sal and others},
|
| 776 |
+
journal={bioRxiv},
|
| 777 |
+
year={2022},
|
| 778 |
+
publisher={Cold Spring Harbor Laboratory}
|
| 779 |
+
}
|
| 780 |
+
```
|
| 781 |
+
|
| 782 |
+
Much of this code builds on the [fairseq](https://github.com/pytorch/fairseq) sequence modeling framework. We use fairseq internally for our protein language modeling research. We highly recommend trying it out if you'd like to pre-train protein language models from scratch.
|
| 783 |
+
|
| 784 |
+
Additionally, if you would like to use the variant prediction benchmark from Meier et al. (2021), we provide a bibtex file with citations for all data in [./examples/variant-prediction/mutation_data.bib](./examples/variant-prediction/mutation_data.bib). You can cite each paper individually, or add all citations in bulk using the LaTeX command:
|
| 785 |
+
|
| 786 |
+
```tex
|
| 787 |
+
\nocite{wrenbeck2017deep,klesmith2015comprehensive,haddox2018mapping,romero2015dissecting,firnberg2014comprehensive,deng2012deep,stiffler2015evolvability,jacquier2013capturing,findlay2018comprehensive,mclaughlin2012spatial,kitzman2015massively,doud2016accurate,pokusaeva2019experimental,mishra2016systematic,kelsic2016rna,melnikov2014comprehensive,brenan2016phenotypic,rockah2015systematic,wu2015functional,aakre2015evolving,qi2014quantitative,matreyek2018multiplex,bandaru2017deconstruction,roscoe2013analyses,roscoe2014systematic,mavor2016determination,chan2017correlation,melamed2013deep,starita2013activity,araya2012fundamental}
|
| 788 |
+
```
|
| 789 |
+
|
| 790 |
+
## License <a name="license"></a>
|
| 791 |
+
|
| 792 |
+
This source code is licensed under the MIT license found in the `LICENSE` file
|
| 793 |
+
in the root directory of this source tree.
|
| 794 |
+
|
| 795 |
+
ESM Metagenomic Atlas (also referred to as “ESM Metagenomic Structure Atlas” or “ESM Atlas”) data is available under a CC BY 4.0 license for academic and commercial use. Copyright (c) Meta Platforms, Inc. All Rights Reserved. Use of the ESM Metagenomic Atlas data is subject to the Meta Open Source [Terms of Use](https://opensource.fb.com/legal/terms/) and [Privacy Policy](https://opensource.fb.com/legal/privacy/).
|
esm/source/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
"""
Package initializer for the esm project.
"""
|
esm/source/environment.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: esmfold
|
| 2 |
+
channels:
|
| 3 |
+
- conda-forge
|
| 4 |
+
- bioconda
|
| 5 |
+
- pytorch
|
| 6 |
+
dependencies:
|
| 7 |
+
- conda-forge::python=3.7
|
| 8 |
+
- conda-forge::setuptools=59.5.0
|
| 9 |
+
- conda-forge::pip
|
| 10 |
+
- conda-forge::openmm=7.5.1
|
| 11 |
+
- conda-forge::pdbfixer
|
| 12 |
+
- conda-forge::cudatoolkit==11.3.*
|
| 13 |
+
- conda-forge::einops
|
| 14 |
+
- conda-forge::fairscale
|
| 15 |
+
- conda-forge::omegaconf
|
| 16 |
+
- conda-forge::hydra-core
|
| 17 |
+
- conda-forge::pandas
|
| 18 |
+
- conda-forge::pytest
|
| 19 |
+
- bioconda::hmmer==3.3.2
|
| 20 |
+
- bioconda::hhsuite==3.3.0
|
| 21 |
+
- bioconda::kalign2==2.04
|
| 22 |
+
- pytorch::pytorch=1.12.*
|
| 23 |
+
- pip:
|
| 24 |
+
- biopython==1.79
|
| 25 |
+
- deepspeed==0.5.9
|
| 26 |
+
- dm-tree==0.1.6
|
| 27 |
+
- ml-collections==0.1.0
|
| 28 |
+
- numpy==1.21.2
|
| 29 |
+
- PyYAML==5.4.1
|
| 30 |
+
- requests==2.26.0
|
| 31 |
+
- scipy==1.7.1
|
| 32 |
+
- tqdm==4.62.2
|
| 33 |
+
- typing-extensions==3.10.0.2
|
| 34 |
+
- pytorch_lightning==1.5.10
|
| 35 |
+
- wandb==0.12.21
|
| 36 |
+
- git+https://github.com/NVIDIA/dllogger.git
|
esm/source/esm/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
from .version import version as __version__ # noqa
|
| 7 |
+
|
| 8 |
+
from .data import Alphabet, BatchConverter, FastaBatchedDataset # noqa
|
| 9 |
+
from .model.esm1 import ProteinBertModel # noqa
|
| 10 |
+
from .model.esm2 import ESM2 # noqa
|
| 11 |
+
from .model.msa_transformer import MSATransformer #noqa
|
| 12 |
+
from . import pretrained # noqa
|
esm/source/esm/axial_attention.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class RowSelfAttention(nn.Module):
    """Compute self-attention over rows of a 2D input.

    The input is a 4D tensor laid out as (num_rows, num_cols, batch, embed_dim)
    — e.g. an MSA with rows = aligned sequences and cols = positions.  Attention
    weights are *tied* across rows: the query/key einsum below sums over the row
    axis, so every row shares one (col x col) attention map per head.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        max_tokens_per_msa: int = 2 ** 16,
    ):
        super().__init__()
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        # Standard 1/sqrt(head_dim) attention scaling; further divided by
        # sqrt(num_rows) in align_scaling() because weights are summed over rows.
        self.scaling = self.head_dim ** -0.5
        # Above this row*col product, forward() falls back to the chunked
        # _batched_forward path (inference only) to bound peak memory.
        self.max_tokens_per_msa = max_tokens_per_msa
        # Output einsum subscript: (heads, batch, col_i, col_j).
        self.attn_shape = "hnij"

        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout_module = nn.Dropout(dropout)

    def align_scaling(self, q):
        """Scaling factor for tied attention: 1/(sqrt(head_dim) * sqrt(num_rows))."""
        num_rows = q.size(0)
        return self.scaling / math.sqrt(num_rows)

    def _batched_forward(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        # Memory-bounded path: accumulate the (row-summed) attention weights in
        # chunks of rows, softmax once, then apply the shared probabilities to
        # each row chunk.  Mathematically identical to the unchunked path
        # because the weights are a sum over rows anyway.
        num_rows, num_cols, batch_size, embed_dim = x.size()
        max_rows = max(1, self.max_tokens_per_msa // num_cols)
        attns = 0
        scaling = self.align_scaling(x)
        for start in range(0, num_rows, max_rows):
            attn_weights = self.compute_attention_weights(
                x[start : start + max_rows],
                scaling,
                self_attn_mask=self_attn_mask,
                # Padding mask is laid out (batch, rows, cols) — inferred from
                # the row slicing here and the permute below; confirm at call sites.
                self_attn_padding_mask=self_attn_padding_mask[:, start : start + max_rows]
                if self_attn_padding_mask is not None
                else None,
            )
            attns += attn_weights
        attn_probs = attns.softmax(-1)
        attn_probs = self.dropout_module(attn_probs)

        outputs = []
        for start in range(0, num_rows, max_rows):
            output = self.compute_attention_update(x[start : start + max_rows], attn_probs)
            outputs.append(output)

        output = torch.cat(outputs, 0)
        return output, attn_probs

    def compute_attention_weights(
        self,
        x,
        scaling: float,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        """Return raw tied-row attention logits of shape (heads, batch, cols, cols)."""
        num_rows, num_cols, batch_size, embed_dim = x.size()
        q = self.q_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
        k = self.k_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
        q *= scaling
        if self_attn_padding_mask is not None:
            # Zero out any padded aligned positions - this is important since
            # we take a sum across the alignment axis.
            q *= 1 - self_attn_padding_mask.permute(1, 2, 0).unsqueeze(3).unsqueeze(4).to(q)

        # Sum over the row axis r -> weights shared by all rows ("tied" attention).
        attn_weights = torch.einsum(f"rinhd,rjnhd->{self.attn_shape}", q, k)

        if self_attn_mask is not None:
            raise NotImplementedError
            # Mask Size: [B x R x C], Weights Size: [H x B x C x C]

        if self_attn_padding_mask is not None:
            # Large negative bias so padded key columns get ~zero probability
            # after the softmax.
            attn_weights = attn_weights.masked_fill(
                self_attn_padding_mask[:, 0].unsqueeze(0).unsqueeze(2),
                -10000,
            )

        return attn_weights

    def compute_attention_update(
        self,
        x,
        attn_probs,
    ):
        """Apply shared attention probabilities to every row's values."""
        num_rows, num_cols, batch_size, embed_dim = x.size()
        v = self.v_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
        context = torch.einsum(f"{self.attn_shape},rjnhd->rinhd", attn_probs, v)
        context = context.contiguous().view(num_rows, num_cols, batch_size, embed_dim)
        output = self.out_proj(context)
        return output

    def forward(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        """Return (output, attn_probs) for input x of shape (rows, cols, batch, dim)."""
        num_rows, num_cols, batch_size, embed_dim = x.size()
        # Chunked path is only valid (and only needed) at inference time.
        if (num_rows * num_cols > self.max_tokens_per_msa) and not torch.is_grad_enabled():
            return self._batched_forward(x, self_attn_mask, self_attn_padding_mask)
        else:
            scaling = self.align_scaling(x)
            attn_weights = self.compute_attention_weights(
                x, scaling, self_attn_mask, self_attn_padding_mask
            )
            attn_probs = attn_weights.softmax(-1)
            attn_probs = self.dropout_module(attn_probs)
            output = self.compute_attention_update(x, attn_probs)
            return output, attn_probs
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class ColumnSelfAttention(nn.Module):
    """Compute self-attention over columns of a 2D input.

    Input layout is (num_rows, num_cols, batch, embed_dim); attention is taken
    independently per column, across the row axis (i.e. across aligned
    sequences at a fixed position).
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        max_tokens_per_msa: int = 2 ** 16,
    ):
        super().__init__()

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.scaling = self.head_dim ** -0.5
        # Above this row*col product, forward() chunks columns (inference only).
        self.max_tokens_per_msa = max_tokens_per_msa

        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout_module = nn.Dropout(dropout)

    def _batched_forward(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        # Memory-bounded path: columns attend independently, so we can simply
        # process column chunks through self() and concatenate the results.
        num_rows, num_cols, batch_size, embed_dim = x.size()
        max_cols = max(1, self.max_tokens_per_msa // num_rows)
        outputs = []
        attns = []
        for start in range(0, num_cols, max_cols):
            output, attn = self(
                x[:, start : start + max_cols],
                self_attn_mask=self_attn_mask,
                # Padding mask indexed (batch, rows, cols) — inferred from the
                # column slicing here and the permute below; confirm at call sites.
                self_attn_padding_mask=self_attn_padding_mask[:, :, start : start + max_cols]
                if self_attn_padding_mask is not None
                else None,
            )
            outputs.append(output)
            attns.append(attn)
        output = torch.cat(outputs, 1)
        attns = torch.cat(attns, 1)
        return output, attns

    def compute_attention_update(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        """Return (output, attn_probs); probs have shape (heads, cols, batch, rows, rows)."""
        num_rows, num_cols, batch_size, embed_dim = x.size()
        if num_rows == 1:
            # if there is only 1 position, this is equivalent and doesn't break with padding
            attn_probs = torch.ones(
                self.num_heads,
                num_cols,
                batch_size,
                num_rows,
                num_rows,
                device=x.device,
                dtype=x.dtype,
            )
            output = self.out_proj(self.v_proj(x))
        else:
            q = self.q_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
            k = self.k_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
            v = self.v_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
            q *= self.scaling

            # Per-column attention: i and j range over rows, c is the column axis.
            attn_weights = torch.einsum("icnhd,jcnhd->hcnij", q, k)

            if self_attn_mask is not None:
                raise NotImplementedError
            if self_attn_padding_mask is not None:
                # Large negative bias so padded rows get ~zero probability.
                attn_weights = attn_weights.masked_fill(
                    self_attn_padding_mask.permute(2, 0, 1).unsqueeze(0).unsqueeze(3),
                    -10000,
                )

            attn_probs = attn_weights.softmax(-1)
            attn_probs = self.dropout_module(attn_probs)
            context = torch.einsum("hcnij,jcnhd->icnhd", attn_probs, v)
            context = context.contiguous().view(num_rows, num_cols, batch_size, embed_dim)
            output = self.out_proj(context)
        return output, attn_probs

    def forward(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        """Return (output, attn_probs) for input x of shape (rows, cols, batch, dim)."""
        num_rows, num_cols, batch_size, embed_dim = x.size()
        # if False and num_rows * num_cols > 2 ** 14 and not torch.is_grad_enabled():
        # Chunked path is only taken at inference time.
        if (num_rows * num_cols) > self.max_tokens_per_msa and not torch.is_grad_enabled():
            return self._batched_forward(
                x,
                self_attn_mask,
                self_attn_padding_mask,
            )
        else:
            return self.compute_attention_update(x, self_attn_mask, self_attn_padding_mask)
|
esm/source/esm/constants.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
# fmt: off
# Canonical ESM residue vocabulary: the 20 standard amino acids followed by
# ambiguity / rare codes (X, B, U, Z, O) and the alignment tokens '.' and '-'.
proteinseq_toks = {
    'toks': list('LAGVSERTIDPKQNFYMHWCXBUZO') + ['.', '-']
}
# fmt: on
|
esm/source/esm/data.py
ADDED
|
@@ -0,0 +1,493 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import itertools
|
| 7 |
+
import os
|
| 8 |
+
from typing import Sequence, Tuple, List, Union
|
| 9 |
+
import pickle
|
| 10 |
+
import re
|
| 11 |
+
import shutil
|
| 12 |
+
import torch
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from esm.constants import proteinseq_toks
|
| 15 |
+
|
| 16 |
+
RawMSA = Sequence[Tuple[str, str]]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class FastaBatchedDataset(object):
    """In-memory dataset of (label, sequence) pairs, typically read from FASTA.

    Indexing yields ``(label, sequence_string)`` tuples, and
    :meth:`get_batch_indices` groups indices into token-budgeted batches.
    """

    def __init__(self, sequence_labels, sequence_strs):
        self.sequence_labels = list(sequence_labels)
        self.sequence_strs = list(sequence_strs)

    @classmethod
    def from_file(cls, fasta_file):
        """Parse *fasta_file* and return a dataset of its records.

        Header lines start with ``>``; an empty header gets a synthetic
        ``seqnum`` label derived from its line number.  Raises ``AssertionError``
        if two records share a label.
        """
        labels, seqs = [], []
        pending_label, chunks = None, []

        def _emit():
            # Finish the record in progress; a missing label means we have not
            # seen any header yet, in which case the buffer is left untouched.
            nonlocal pending_label, chunks
            if pending_label is None:
                return
            labels.append(pending_label)
            seqs.append("".join(chunks))
            pending_label, chunks = None, []

        with open(fasta_file, "r") as handle:
            for lineno, raw in enumerate(handle):
                if raw.startswith(">"):  # header line
                    _emit()
                    header = raw[1:].strip()
                    pending_label = header if header else f"seqnum{lineno:09d}"
                else:  # sequence line
                    chunks.append(raw.strip())

        _emit()

        assert len(set(labels)) == len(labels), "Found duplicate sequence labels"

        return cls(labels, seqs)

    def __len__(self):
        return len(self.sequence_labels)

    def __getitem__(self, idx):
        return self.sequence_labels[idx], self.sequence_strs[idx]

    def get_batch_indices(self, toks_per_batch, extra_toks_per_seq=0):
        """Group dataset indices into batches of at most *toks_per_batch* tokens.

        Sequences are sorted by length so each batch holds similarly-sized
        items; the budget counts ``max_len_in_batch * batch_size``.  A single
        over-budget sequence still forms its own batch.
        """
        ordered = sorted((len(s), i) for i, s in enumerate(self.sequence_strs))
        batches = []
        current = []
        longest = 0

        for size, idx in ordered:
            size += extra_toks_per_seq
            # Would adding this sequence blow the budget? Then close the batch.
            if current and max(size, longest) * (len(current) + 1) > toks_per_batch:
                batches.append(current)
                current, longest = [], 0
            longest = max(longest, size)
            current.append(idx)

        if current:
            batches.append(current)
        return batches
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class Alphabet(object):
    """Vocabulary mapping between residue/special tokens and integer indices.

    The full token list is built as: ``prepend_toks`` + ``standard_toks`` +
    ``<null_*>`` fillers (so the pre-append count is a multiple of 8) +
    ``append_toks``.  Also records whether a BOS/EOS token is added around
    each sequence and whether inputs are MSAs (which selects the batch
    converter type).
    """

    def __init__(
        self,
        standard_toks: Sequence[str],
        prepend_toks: Sequence[str] = ("<null_0>", "<pad>", "<eos>", "<unk>"),
        append_toks: Sequence[str] = ("<cls>", "<mask>", "<sep>"),
        prepend_bos: bool = True,
        append_eos: bool = False,
        use_msa: bool = False,
    ):
        self.standard_toks = list(standard_toks)
        self.prepend_toks = list(prepend_toks)
        self.append_toks = list(append_toks)
        self.prepend_bos = prepend_bos
        self.append_eos = append_eos
        self.use_msa = use_msa

        self.all_toks = list(self.prepend_toks)
        self.all_toks.extend(self.standard_toks)
        # Pad the vocabulary with <null_*> tokens up to a multiple of 8
        # before the appended specials.
        for i in range((8 - (len(self.all_toks) % 8)) % 8):
            self.all_toks.append(f"<null_{i  + 1}>")
        self.all_toks.extend(self.append_toks)

        self.tok_to_idx = {tok: i for i, tok in enumerate(self.all_toks)}

        # <unk> must exist; the other specials fall back to unk_idx via get_idx
        # if an alphabet variant omits them.
        self.unk_idx = self.tok_to_idx["<unk>"]
        self.padding_idx = self.get_idx("<pad>")
        self.cls_idx = self.get_idx("<cls>")
        self.mask_idx = self.get_idx("<mask>")
        self.eos_idx = self.get_idx("<eos>")
        self.all_special_tokens = ['<eos>', '<unk>', '<pad>', '<cls>', '<mask>']
        # Multi-character tokens the tokenizer must never split apart.
        self.unique_no_split_tokens = self.all_toks

    def __len__(self):
        return len(self.all_toks)

    def get_idx(self, tok):
        """Return the index of *tok*, or the <unk> index if unknown."""
        return self.tok_to_idx.get(tok, self.unk_idx)

    def get_tok(self, ind):
        """Return the token string at index *ind*."""
        return self.all_toks[ind]

    def to_dict(self):
        """Return a copy of the token -> index mapping."""
        return self.tok_to_idx.copy()

    def get_batch_converter(self, truncation_seq_length: int = None):
        """Return the batch converter matching this alphabet (MSA or single-seq)."""
        if self.use_msa:
            return MSABatchConverter(self, truncation_seq_length)
        else:
            return BatchConverter(self, truncation_seq_length)

    @classmethod
    def from_architecture(cls, name: str) -> "Alphabet":
        """Build the alphabet preset matching a known model architecture name.

        Raises ``ValueError`` for unrecognized names.
        """
        if name in ("ESM-1", "protein_bert_base"):
            standard_toks = proteinseq_toks["toks"]
            prepend_toks: Tuple[str, ...] = ("<null_0>", "<pad>", "<eos>", "<unk>")
            append_toks: Tuple[str, ...] = ("<cls>", "<mask>", "<sep>")
            prepend_bos = True
            append_eos = False
            use_msa = False
        elif name in ("ESM-1b", "roberta_large"):
            standard_toks = proteinseq_toks["toks"]
            prepend_toks = ("<cls>", "<pad>", "<eos>", "<unk>")
            append_toks = ("<mask>",)
            prepend_bos = True
            append_eos = True
            use_msa = False
        elif name in ("MSA Transformer", "msa_transformer"):
            standard_toks = proteinseq_toks["toks"]
            prepend_toks = ("<cls>", "<pad>", "<eos>", "<unk>")
            append_toks = ("<mask>",)
            prepend_bos = True
            append_eos = False
            use_msa = True
        elif "invariant_gvp" in name.lower():
            standard_toks = proteinseq_toks["toks"]
            prepend_toks = ("<null_0>", "<pad>", "<eos>", "<unk>")
            append_toks = ("<mask>", "<cath>", "<af2>")
            prepend_bos = True
            append_eos = False
            use_msa = False
        else:
            raise ValueError("Unknown architecture selected")
        return cls(standard_toks, prepend_toks, append_toks, prepend_bos, append_eos, use_msa)

    def _tokenize(self, text) -> List[str]:
        # Basic whitespace tokenization for text between special tokens.
        return text.split()

    def tokenize(self, text, **kwargs) -> List[str]:
        """
        Inspired by https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_utils.py
        Converts a string in a sequence of tokens, using the tokenizer.

        Args:
            text (:obj:`str`):
                The sequence to be encoded.

        Returns:
            :obj:`List[str]`: The list of tokens.
        """

        def split_on_token(tok, text):
            # Split *text* around every occurrence of the special token *tok*,
            # keeping the token itself as its own element.
            result = []
            split_text = text.split(tok)
            for i, sub_text in enumerate(split_text):
                # AddedToken can control whitespace stripping around them.
                # We use them for GPT2 and Roberta to have different behavior depending on the special token
                # Cf. https://github.com/huggingface/transformers/pull/2778
                # and https://github.com/huggingface/transformers/issues/3788
                # We strip left and right by default
                if i < len(split_text) - 1:
                    sub_text = sub_text.rstrip()
                if i > 0:
                    sub_text = sub_text.lstrip()

                if i == 0 and not sub_text:
                    result.append(tok)
                elif i == len(split_text) - 1:
                    if sub_text:
                        result.append(sub_text)
                    else:
                        pass
                else:
                    if sub_text:
                        result.append(sub_text)
                    result.append(tok)
            return result

        def split_on_tokens(tok_list, text):
            # Iteratively split the text around every no-split token, then
            # whitespace-tokenize whatever remains between them.
            if not text.strip():
                return []

            tokenized_text = []
            text_list = [text]
            for tok in tok_list:
                tokenized_text = []
                for sub_text in text_list:
                    if sub_text not in self.unique_no_split_tokens:
                        tokenized_text.extend(split_on_token(tok, sub_text))
                    else:
                        tokenized_text.append(sub_text)
                text_list = tokenized_text

            return list(
                itertools.chain.from_iterable(
                    (
                        self._tokenize(token)
                        if token not in self.unique_no_split_tokens
                        else [token]
                        for token in tokenized_text
                    )
                )
            )

        no_split_token = self.unique_no_split_tokens
        tokenized_text = split_on_tokens(no_split_token, text)
        return tokenized_text

    def encode(self, text):
        """Tokenize *text* and map each token to its index (KeyError if absent)."""
        return [self.tok_to_idx[tok] for tok in self.tokenize(text)]
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
class BatchConverter(object):
    """Callable to convert an unprocessed (labels + strings) batch to a
    processed (labels + tensor) batch.

    Returns ``(labels, strs, tokens)`` where ``tokens`` is an int64 tensor of
    shape (batch, max_len [+1 for BOS] [+1 for EOS]) padded with the
    alphabet's padding index.
    """

    def __init__(self, alphabet, truncation_seq_length: int = None):
        self.alphabet = alphabet
        # If set (non-zero), sequences are truncated to this many tokens
        # before BOS/EOS are accounted for.
        self.truncation_seq_length = truncation_seq_length

    def __call__(self, raw_batch: Sequence[Tuple[str, str]]):
        # RoBERTa uses an eos token, while ESM-1 does not.
        batch_size = len(raw_batch)
        batch_labels, seq_str_list = zip(*raw_batch)
        seq_encoded_list = [self.alphabet.encode(seq_str) for seq_str in seq_str_list]
        if self.truncation_seq_length:
            seq_encoded_list = [seq_str[:self.truncation_seq_length] for seq_str in seq_encoded_list]
        max_len = max(len(seq_encoded) for seq_encoded in seq_encoded_list)
        # Allocate room for BOS/EOS as dictated by the alphabet, fill with padding.
        tokens = torch.empty(
            (
                batch_size,
                max_len + int(self.alphabet.prepend_bos) + int(self.alphabet.append_eos),
            ),
            dtype=torch.int64,
        )
        tokens.fill_(self.alphabet.padding_idx)
        labels = []
        strs = []

        for i, (label, seq_str, seq_encoded) in enumerate(
            zip(batch_labels, seq_str_list, seq_encoded_list)
        ):
            labels.append(label)
            strs.append(seq_str)
            if self.alphabet.prepend_bos:
                # BOS slot holds the cls token index.
                tokens[i, 0] = self.alphabet.cls_idx
            seq = torch.tensor(seq_encoded, dtype=torch.int64)
            # Write the encoded sequence after the (optional) BOS slot.
            tokens[
                i,
                int(self.alphabet.prepend_bos) : len(seq_encoded)
                + int(self.alphabet.prepend_bos),
            ] = seq
            if self.alphabet.append_eos:
                tokens[i, len(seq_encoded) + int(self.alphabet.prepend_bos)] = self.alphabet.eos_idx

        return labels, strs, tokens
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
class MSABatchConverter(BatchConverter):
    """Batch converter for MSAs.

    Accepts either a single MSA (a sequence of (label, sequence) rows) or a
    batch of MSAs; all rows within one MSA must have equal length.
    """

    def __call__(self, inputs: Union[Sequence["RawMSA"], "RawMSA"]):
        # A single MSA has a string at inputs[0][0] (the first label);
        # a batch of MSAs has a (label, seq) tuple there instead.
        if isinstance(inputs[0][0], str):
            # Input is a single MSA
            raw_batch: Sequence[RawMSA] = [inputs]  # type: ignore
        else:
            raw_batch = inputs  # type: ignore

        batch_size = len(raw_batch)
        max_alignments = max(len(msa) for msa in raw_batch)
        max_seqlen = max(len(msa[0][1]) for msa in raw_batch)

        # (batch, alignments, tokens) padded with the alphabet padding index.
        tokens = torch.empty(
            (
                batch_size,
                max_alignments,
                max_seqlen + int(self.alphabet.prepend_bos) + int(self.alphabet.append_eos),
            ),
            dtype=torch.int64,
        )
        tokens.fill_(self.alphabet.padding_idx)
        labels = []
        strs = []

        for i, msa in enumerate(raw_batch):
            msa_seqlens = set(len(seq) for _, seq in msa)
            if not len(msa_seqlens) == 1:
                raise RuntimeError(
                    "Received unaligned sequences for input to MSA, all sequence "
                    "lengths must be equal."
                )
            # Reuse the single-sequence converter for each MSA's rows.
            msa_labels, msa_strs, msa_tokens = super().__call__(msa)
            labels.append(msa_labels)
            strs.append(msa_strs)
            tokens[i, : msa_tokens.size(0), : msa_tokens.size(1)] = msa_tokens

        return labels, strs, tokens
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
def read_fasta(
    path,
    keep_gaps=True,
    keep_insertions=True,
    to_upper=False,
):
    """Yield (description, sequence) pairs parsed from the FASTA file at `path`.

    Parsing options are forwarded to `read_alignment_lines`:
        keep_gaps: if False, strip '-' characters from sequences.
        keep_insertions: if False, strip lowercase (insertion) characters.
        to_upper: if True, upper-case the yielded sequences.
    """
    with open(path, "r") as f:
        yield from read_alignment_lines(
            f, keep_gaps=keep_gaps, keep_insertions=keep_insertions, to_upper=to_upper
        )
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def read_alignment_lines(
    lines,
    keep_gaps=True,
    keep_insertions=True,
    to_upper=False,
):
    """Yield (description, sequence) pairs from FASTA/A3M-style lines.

    Args:
        lines: iterable of text lines (e.g. an open file handle).
        keep_gaps: if False, remove '-' gap characters.
        keep_insertions: if False, remove lowercase insertion characters.
        to_upper: if True, upper-case the yielded sequence.

    Note: the input must contain at least one '>' record; empty input
    trips the trailing assertion (original behavior preserved).
    """
    seq = desc = None

    def parse(s):
        # Apply the gap/insertion/case options to one raw sequence string.
        if not keep_gaps:
            s = re.sub("-", "", s)
        if not keep_insertions:
            s = re.sub("[a-z]", "", s)
        return s.upper() if to_upper else s

    for line in lines:
        # Line may be empty if seq % file_line_width == 0
        if len(line) > 0 and line[0] == ">":
            if seq is not None:
                # Emit the previously accumulated record.
                yield desc, parse(seq)
            desc = line.strip().lstrip(">")
            seq = ""
        else:
            assert isinstance(seq, str)
            seq += line.strip()
    # Emit the final record.
    assert isinstance(seq, str) and isinstance(desc, str)
    yield desc, parse(seq)
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
class ESMStructuralSplitDataset(torch.utils.data.Dataset):
    """
    Structural Split Dataset as described in section A.10 of the supplement of our paper.
    https://doi.org/10.1101/622803

    We use the full version of SCOPe 2.07, clustered at 90% sequence identity,
    generated on January 23, 2020.

    For each SCOPe domain:
        - We extract the sequence from the corresponding PDB file
        - We extract the 3D coordinates of the Carbon beta atoms, aligning them
          to the sequence. We put NaN where Cb atoms are missing.
        - From the 3D coordinates, we calculate a pairwise distance map, based
          on L2 distance
        - We use DSSP to generate secondary structure labels for the corresponding
          PDB file. This is also aligned to the sequence. We put - where SSP
          labels are missing.

    For each SCOPe classification level of family/superfamily/fold (in order of difficulty),
    we have split the data into 5 partitions for cross validation. These are provided
    in a downloaded splits folder, in the format:
        splits/{split_level}/{cv_partition}/{train|valid}.txt
    where train is the partition and valid is the concatenation of the remaining 4.

    For each SCOPe domain, we provide a pkl dump that contains:
        - seq    : The domain sequence, stored as an L-length string
        - ssp    : The secondary structure labels, stored as an L-length string
        - dist   : The distance map, stored as an LxL numpy array
        - coords : The 3D coordinates, stored as an Lx3 numpy array

    """

    # Subdirectory of `root_path` that holds the downloaded data.
    base_folder = "structural-data"
    file_list = [
        #  url                                                                       tar filename    extracted dir  MD5 Hash
        (
            "https://dl.fbaipublicfiles.com/fair-esm/structural-data/splits.tar.gz",
            "splits.tar.gz",
            "splits",
            "456fe1c7f22c9d3d8dfe9735da52411d",
        ),
        (
            "https://dl.fbaipublicfiles.com/fair-esm/structural-data/pkl.tar.gz",
            "pkl.tar.gz",
            "pkl",
            "644ea91e56066c750cd50101d390f5db",
        ),
    ]

    def __init__(
        self,
        split_level,
        cv_partition,
        split,
        root_path=os.path.expanduser("~/.cache/torch/data/esm"),
        download=False,
    ):
        """Load the domain-name list for one (split_level, cv_partition, split).

        Args:
            split_level: SCOPe classification level ("family" / "superfamily" / "fold").
            cv_partition: which of the cross-validation partitions to use.
            split: "train" or "valid".
            root_path: cache directory holding `structural-data/`.
            download: if True, fetch and unpack the data archives first.
        """
        super().__init__()
        assert split in [
            "train",
            "valid",
        ], "train_valid must be 'train' or 'valid'"
        self.root_path = root_path
        self.base_path = os.path.join(self.root_path, self.base_folder)

        # check if root path has what you need or else download it
        if download:
            self.download()

        self.split_file = os.path.join(
            self.base_path, "splits", split_level, cv_partition, f"{split}.txt"
        )
        self.pkl_dir = os.path.join(self.base_path, "pkl")
        self.names = []
        # One SCOPe domain name per line of the split file.
        with open(self.split_file) as f:
            self.names = f.read().splitlines()

    def __len__(self):
        return len(self.names)

    def _check_exists(self) -> bool:
        """True when every expected extracted directory already exists."""
        for (_, _, filename, _) in self.file_list:
            fpath = os.path.join(self.base_path, filename)
            if not os.path.exists(fpath) or not os.path.isdir(fpath):
                return False
        return True

    def download(self):
        """Download and unpack both archives unless already present."""
        if self._check_exists():
            print("Files already downloaded and verified")
            return

        # Imported lazily so torchvision is only required when downloading.
        from torchvision.datasets.utils import download_url

        for url, tar_filename, filename, md5_hash in self.file_list:
            download_path = os.path.join(self.base_path, tar_filename)
            download_url(url=url, root=self.base_path, filename=tar_filename, md5=md5_hash)
            shutil.unpack_archive(download_path, self.base_path)

    def __getitem__(self, idx):
        """
        Returns a dict with the following entries
            - seq    : Str (domain sequence)
            - ssp    : Str (SSP labels)
            - dist   : np.array (distance map)
            - coords : np.array (3D coordinates)
        """
        name = self.names[idx]
        # pkl files are sharded by characters 1-2 of the domain name.
        pkl_fname = os.path.join(self.pkl_dir, name[1:3], f"{name}.pkl")
        with open(pkl_fname, "rb") as f:
            obj = pickle.load(f)
        return obj
|
esm/source/esm/esmfold/v1/__init__.py
ADDED
|
File without changes
|
esm/source/esm/esmfold/v1/categorical_mixture.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class CategoricalMixture:
    """Categorical distribution over `bins` equal-width buckets spanning
    [start, end], parameterized by unnormalized logits over the buckets."""

    def __init__(self, param, bins=50, start=0, end=1):
        # All tensors are of shape ..., bins.
        self.logits = param
        bins = torch.linspace(
            start, end, bins + 1, device=self.logits.device, dtype=self.logits.dtype
        )
        # Midpoint value of each bucket, shape (bins,).
        self.v_bins = (bins[:-1] + bins[1:]) / 2

    def log_prob(self, true):
        # Shapes are:
        # self.probs: ... x bins
        # true      : ...
        # Bucketize each true value by finding the nearest bucket midpoint.
        true_index = (
            (
                true.unsqueeze(-1)
                - self.v_bins[
                    [
                        None,
                    ]
                    * true.ndim
                ]
            )
            .abs()
            .argmin(-1)
        )
        nll = self.logits.log_softmax(-1)
        # Gather the log-probability of each value's bucket.
        return torch.take_along_dim(nll, true_index.unsqueeze(-1), dim=-1).squeeze(-1)

    def mean(self):
        # Expected value: probability-weighted average of bucket midpoints.
        return (self.logits.softmax(-1) @ self.v_bins.unsqueeze(1)).squeeze(-1)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def categorical_lddt(logits, bins=50):
    """Expected lDDT from per-bin logits.

    Logits are ..., 37, bins; returns the mean of the induced categorical
    distribution over [0, 1], shape ..., 37.
    """
    return CategoricalMixture(logits, bins=bins).mean()
|
esm/source/esm/esmfold/v1/esmfold.py
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
import typing as T
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from functools import partial
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
from torch import nn
|
| 12 |
+
from torch.nn import LayerNorm
|
| 13 |
+
|
| 14 |
+
import esm
|
| 15 |
+
from esm import Alphabet
|
| 16 |
+
from esm.esmfold.v1.categorical_mixture import categorical_lddt
|
| 17 |
+
from esm.esmfold.v1.misc import (
|
| 18 |
+
batch_encode_sequences,
|
| 19 |
+
collate_dense_tensors,
|
| 20 |
+
output_to_pdb,
|
| 21 |
+
)
|
| 22 |
+
from esm.esmfold.v1.trunk import FoldingTrunk, FoldingTrunkConfig
|
| 23 |
+
from openfold.data.data_transforms import make_atom14_masks
|
| 24 |
+
from openfold.np import residue_constants
|
| 25 |
+
from openfold.utils.loss import compute_predicted_aligned_error, compute_tm
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class ESMFoldConfig:
    """Top-level ESMFold configuration: folding-trunk config plus the hidden
    size of the pLDDT head."""

    # NOTE(review): this default FoldingTrunkConfig() is evaluated once at
    # class definition and shared by every ESMFoldConfig instance — safe only
    # if it is never mutated; consider field(default_factory=FoldingTrunkConfig).
    trunk: T.Any = FoldingTrunkConfig()
    lddt_head_hid_dim: int = 128
| 33 |
+
|
| 34 |
+
# Loader used for checkpoints addressed only by checkpoint name.
load_fn = esm.pretrained.load_model_and_alphabet
# Maps a config `esm_type` string to a zero-arg callable returning
# (language_model, alphabet).
esm_registry = {
    "esm2_8M": partial(load_fn, "esm2_t6_8M_UR50D_500K"),
    "esm2_8M_270K": esm.pretrained.esm2_t6_8M_UR50D,
    "esm2_35M": partial(load_fn, "esm2_t12_35M_UR50D_500K"),
    "esm2_35M_270K": esm.pretrained.esm2_t12_35M_UR50D,
    "esm2_150M": partial(load_fn, "esm2_t30_150M_UR50D_500K"),
    "esm2_150M_270K": partial(load_fn, "esm2_t30_150M_UR50D_270K"),
    "esm2_650M": esm.pretrained.esm2_t33_650M_UR50D,
    "esm2_650M_270K": partial(load_fn, "esm2_t33_650M_270K_UR50D"),
    "esm2_3B": esm.pretrained.esm2_t36_3B_UR50D,
    "esm2_3B_270K": partial(load_fn, "esm2_t36_3B_UR50D_500K"),
    "esm2_15B": esm.pretrained.esm2_t48_15B_UR50D,
}
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class ESMFold(nn.Module):
    """End-to-end single-sequence structure prediction: a frozen, half-precision
    ESM-2 language model feeding a folding trunk, with distogram, LM, pTM and
    pLDDT output heads."""

    def __init__(self, esmfold_config=None, **kwargs):
        super().__init__()

        self.cfg = esmfold_config if esmfold_config else ESMFoldConfig(**kwargs)
        cfg = self.cfg

        self.distogram_bins = 64

        # Frozen language model + its alphabet, selected by cfg.esm_type.
        self.esm, self.esm_dict = esm_registry.get(cfg.esm_type)()

        self.esm.requires_grad_(False)
        self.esm.half()

        self.esm_feats = self.esm.embed_dim
        self.esm_attns = self.esm.num_layers * self.esm.attention_heads
        self.register_buffer("af2_to_esm", ESMFold._af2_to_esm(self.esm_dict))
        # Learned softmax weights combining per-layer LM representations.
        self.esm_s_combine = nn.Parameter(torch.zeros(self.esm.num_layers + 1))

        c_s = cfg.trunk.sequence_state_dim
        c_z = cfg.trunk.pairwise_state_dim

        # Projects LM sequence features into the trunk's sequence state.
        self.esm_s_mlp = nn.Sequential(
            LayerNorm(self.esm_feats),
            nn.Linear(self.esm_feats, c_s),
            nn.ReLU(),
            nn.Linear(c_s, c_s),
        )
        if cfg.use_esm_attn_map:
            # Projects flattened LM attention maps into the pairwise state.
            self.esm_z_mlp = nn.Sequential(
                LayerNorm(self.esm_attns),
                nn.Linear(self.esm_attns, c_z),
                nn.ReLU(),
                nn.Linear(c_z, c_z),
            )

        # 0 is padding, N is unknown residues, N + 1 is mask.
        self.n_tokens_embed = residue_constants.restype_num + 3
        self.pad_idx = 0
        self.unk_idx = self.n_tokens_embed - 2
        self.mask_idx = self.n_tokens_embed - 1
        self.embedding = nn.Embedding(self.n_tokens_embed, c_s, padding_idx=0)

        self.trunk = FoldingTrunk(**cfg.trunk)

        self.distogram_head = nn.Linear(c_z, self.distogram_bins)
        self.ptm_head = nn.Linear(c_z, self.distogram_bins)
        self.lm_head = nn.Linear(c_s, self.n_tokens_embed)
        self.lddt_bins = 50
        self.lddt_head = nn.Sequential(
            nn.LayerNorm(cfg.trunk.structure_module.c_s),
            nn.Linear(cfg.trunk.structure_module.c_s, cfg.lddt_head_hid_dim),
            nn.Linear(cfg.lddt_head_hid_dim, cfg.lddt_head_hid_dim),
            nn.Linear(cfg.lddt_head_hid_dim, 37 * self.lddt_bins),
        )

    @staticmethod
    def _af2_to_esm(d: Alphabet):
        """Lookup table mapping shifted AF2 residue indices to ESM token indices."""
        # Remember that t is shifted from residue_constants by 1 (0 is padding).
        esm_reorder = [d.padding_idx] + [
            d.get_idx(v) for v in residue_constants.restypes_with_x
        ]
        return torch.tensor(esm_reorder)

    def _af2_idx_to_esm_idx(self, aa, mask):
        # Shift by 1 (0 becomes padding), zero masked positions, then translate
        # through the af2_to_esm lookup table.
        aa = (aa + 1).masked_fill(mask != 1, 0)
        return self.af2_to_esm[aa]

    def _compute_language_model_representations(
        self, esmaa: torch.Tensor
    ) -> T.Tuple[torch.Tensor, T.Optional[torch.Tensor]]:
        """Adds bos/eos tokens for the language model, since the structure module doesn't use these.

        Returns (esm_s, esm_z): per-layer sequence representations stacked on
        dim 2, and (only when cfg.use_esm_attn_map) flattened attention maps,
        else None.
        """
        batch_size = esmaa.size(0)

        bosi, eosi = self.esm_dict.cls_idx, self.esm_dict.eos_idx
        bos = esmaa.new_full((batch_size, 1), bosi)
        eos = esmaa.new_full((batch_size, 1), self.esm_dict.padding_idx)
        esmaa = torch.cat([bos, esmaa, eos], dim=1)
        # Use the first padding index as eos during inference.
        esmaa[range(batch_size), (esmaa != 1).sum(1)] = eosi

        res = self.esm(
            esmaa,
            repr_layers=range(self.esm.num_layers + 1),
            need_head_weights=self.cfg.use_esm_attn_map,
        )
        esm_s = torch.stack(
            [v for _, v in sorted(res["representations"].items())], dim=2
        )
        esm_s = esm_s[:, 1:-1]  # B, L, nLayers, C
        # Strip bos/eos rows and columns from the attention maps too.
        esm_z = (
            res["attentions"].permute(0, 4, 3, 1, 2).flatten(3, 4)[:, 1:-1, 1:-1, :]
            if self.cfg.use_esm_attn_map
            else None
        )
        return esm_s, esm_z

    def _mask_inputs_to_esm(self, esmaa, pattern):
        # Replace positions marked 1 in `pattern` with the LM mask token.
        new_esmaa = esmaa.clone()
        new_esmaa[pattern == 1] = self.esm_dict.mask_idx
        return new_esmaa

    def forward(
        self,
        aa: torch.Tensor,
        mask: T.Optional[torch.Tensor] = None,
        residx: T.Optional[torch.Tensor] = None,
        masking_pattern: T.Optional[torch.Tensor] = None,
        num_recycles: T.Optional[int] = None,
    ):
        """Runs a forward pass given input tokens. Use `model.infer` to
        run inference from a sequence.

        Args:
            aa (torch.Tensor): Tensor containing indices corresponding to amino acids. Indices match
                openfold.np.residue_constants.restype_order_with_x.
            mask (torch.Tensor): Binary tensor with 1 meaning position is unmasked and 0 meaning position is masked.
            residx (torch.Tensor): Residue indices of amino acids. Will assume contiguous if not provided.
            masking_pattern (torch.Tensor): Optional masking to pass to the input. Binary tensor of the same size
                as `aa`. Positions with 1 will be masked. ESMFold sometimes produces different samples when
                different masks are provided.
            num_recycles (int): How many recycle iterations to perform. If None, defaults to training max
                recycles, which is 3.
        """

        if mask is None:
            mask = torch.ones_like(aa)

        B = aa.shape[0]
        L = aa.shape[1]
        device = aa.device

        if residx is None:
            residx = torch.arange(L, device=device).expand_as(aa)

        # === ESM ===
        esmaa = self._af2_idx_to_esm_idx(aa, mask)

        if masking_pattern is not None:
            esmaa = self._mask_inputs_to_esm(esmaa, masking_pattern)

        esm_s, esm_z = self._compute_language_model_representations(esmaa)

        # Convert esm_s to the precision used by the trunk and
        # the structure module. These tensors may be a lower precision if, for example,
        # we're running the language model in fp16 precision.
        esm_s = esm_s.to(self.esm_s_combine.dtype)
        esm_s = esm_s.detach()

        # === preprocessing ===
        # Weighted combination of the per-layer LM representations.
        esm_s = (self.esm_s_combine.softmax(0).unsqueeze(0) @ esm_s).squeeze(2)

        s_s_0 = self.esm_s_mlp(esm_s)
        if self.cfg.use_esm_attn_map:
            esm_z = esm_z.to(self.esm_s_combine.dtype)
            esm_z = esm_z.detach()
            s_z_0 = self.esm_z_mlp(esm_z)
        else:
            s_z_0 = s_s_0.new_zeros(B, L, L, self.cfg.trunk.pairwise_state_dim)

        s_s_0 += self.embedding(aa)

        structure: dict = self.trunk(
            s_s_0, s_z_0, aa, residx, mask, no_recycles=num_recycles
        )
        # Documenting what we expect:
        structure = {
            k: v
            for k, v in structure.items()
            if k
            in [
                "s_z",
                "s_s",
                "frames",
                "sidechain_frames",
                "unnormalized_angles",
                "angles",
                "positions",
                "states",
            ]
        }

        # Symmetrize the distogram logits across the two residue axes.
        disto_logits = self.distogram_head(structure["s_z"])
        disto_logits = (disto_logits + disto_logits.transpose(1, 2)) / 2
        structure["distogram_logits"] = disto_logits

        lm_logits = self.lm_head(structure["s_s"])
        structure["lm_logits"] = lm_logits

        structure["aatype"] = aa
        make_atom14_masks(structure)

        for k in [
            "atom14_atom_exists",
            "atom37_atom_exists",
        ]:
            structure[k] *= mask.unsqueeze(-1)
        structure["residue_index"] = residx

        lddt_head = self.lddt_head(structure["states"]).reshape(
            structure["states"].shape[0], B, L, -1, self.lddt_bins
        )
        structure["lddt_head"] = lddt_head
        # pLDDT from the final structure-module iteration.
        plddt = categorical_lddt(lddt_head[-1], bins=self.lddt_bins)
        structure["plddt"] = (
            100 * plddt
        )  # we predict plDDT between 0 and 1, scale to be between 0 and 100.

        ptm_logits = self.ptm_head(structure["s_z"])

        seqlen = mask.type(torch.int64).sum(1)
        structure["ptm_logits"] = ptm_logits
        # pTM is computed per batch element on the unpadded length.
        structure["ptm"] = torch.stack(
            [
                compute_tm(
                    batch_ptm_logits[None, :sl, :sl],
                    max_bins=31,
                    no_bins=self.distogram_bins,
                )
                for batch_ptm_logits, sl in zip(ptm_logits, seqlen)
            ]
        )
        structure.update(
            compute_predicted_aligned_error(
                ptm_logits, max_bin=31, no_bins=self.distogram_bins
            )
        )

        return structure

    @torch.no_grad()
    def infer(
        self,
        sequences: T.Union[str, T.List[str]],
        residx=None,
        masking_pattern: T.Optional[torch.Tensor] = None,
        num_recycles: T.Optional[int] = None,
        residue_index_offset: T.Optional[int] = 512,
        chain_linker: T.Optional[str] = "G" * 25,
    ):
        """Runs a forward pass given input sequences.

        Args:
            sequences (Union[str, List[str]]): A list of sequences to make predictions for. Multimers can also be passed in,
                each chain should be separated by a ':' token (e.g. "<chain1>:<chain2>:<chain3>").
            residx (torch.Tensor): Residue indices of amino acids. Will assume contiguous if not provided.
            masking_pattern (torch.Tensor): Optional masking to pass to the input. Binary tensor of the same size
                as `aa`. Positions with 1 will be masked. ESMFold sometimes produces different samples when
                different masks are provided.
            num_recycles (int): How many recycle iterations to perform. If None, defaults to training max
                recycles (cfg.trunk.max_recycles), which is 4.
            residue_index_offset (int): Residue index separation between chains if predicting a multimer. Has no effect on
                single chain predictions. Default: 512.
            chain_linker (str): Linker to use between chains if predicting a multimer. Has no effect on single chain
                predictions. Default: length-25 poly-G ("G" * 25).
        """
        if isinstance(sequences, str):
            sequences = [sequences]

        aatype, mask, _residx, linker_mask, chain_index = batch_encode_sequences(
            sequences, residue_index_offset, chain_linker
        )

        if residx is None:
            residx = _residx
        elif not isinstance(residx, torch.Tensor):
            residx = collate_dense_tensors(residx)

        aatype, mask, residx, linker_mask = map(
            lambda x: x.to(self.device), (aatype, mask, residx, linker_mask)
        )

        output = self.forward(
            aatype,
            mask=mask,
            residx=residx,
            masking_pattern=masking_pattern,
            num_recycles=num_recycles,
        )

        # Zero out atoms that belong to the artificial inter-chain linker.
        output["atom37_atom_exists"] = output[
            "atom37_atom_exists"
        ] * linker_mask.unsqueeze(2)

        output["mean_plddt"] = (output["plddt"] * output["atom37_atom_exists"]).sum(
            dim=(1, 2)
        ) / output["atom37_atom_exists"].sum(dim=(1, 2))
        output["chain_index"] = chain_index

        return output

    def output_to_pdb(self, output: T.Dict) -> T.List[str]:
        """Returns the pbd (file) string from the model given the model output."""
        return output_to_pdb(output)

    def infer_pdbs(self, seqs: T.List[str], *args, **kwargs) -> T.List[str]:
        """Returns list of pdb (files) strings from the model given a list of input sequences."""
        output = self.infer(seqs, *args, **kwargs)
        return self.output_to_pdb(output)

    def infer_pdb(self, sequence: str, *args, **kwargs) -> str:
        """Returns the pdb (file) string from the model given an input sequence."""
        return self.infer_pdbs([sequence], *args, **kwargs)[0]

    def set_chunk_size(self, chunk_size: T.Optional[int]):
        # This parameter means the axial attention will be computed
        # in a chunked manner. This should make the memory used more or less O(L) instead of O(L^2).
        # It's equivalent to running a for loop over chunks of the dimension we're iterative over,
        # where the chunk_size is the size of the chunks, so 128 would mean to parse 128-lengthed chunks.
        # Setting the value to None will return to default behavior, disable chunking.
        self.trunk.set_chunk_size(chunk_size)

    @property
    def device(self):
        # Any parameter's device tells us where the model lives.
        return self.esm_s_combine.device
|
esm/source/esm/esmfold/v1/misc.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
import typing as T
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from einops import rearrange, repeat
|
| 11 |
+
from torch import nn
|
| 12 |
+
from openfold.np import residue_constants
|
| 13 |
+
from openfold.np.protein import Protein as OFProtein
|
| 14 |
+
from openfold.np.protein import to_pdb
|
| 15 |
+
from openfold.utils.feats import atom14_to_atom37
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def encode_sequence(
    seq: str,
    residue_index_offset: T.Optional[int] = 512,
    chain_linker: T.Optional[str] = "G" * 25,
) -> T.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Encode a (possibly multi-chain) sequence into model input tensors.

    The input is split on ":" into chains, which are re-joined with
    ``chain_linker`` (default: 25 "G" characters) into one linear sequence.

    Args:
        seq: amino-acid string; ":" separates chains.
        residue_index_offset: gap added to the residue index between
            consecutive chains (None is treated as 0, i.e. no offset).
        chain_linker: string inserted between chains (None is treated as "").

    Returns:
        Tuple of 1-D tensors over the joined sequence:
        encoded (token ids, unknown letters map to "X"),
        residx (residue indices, offset per chain),
        linker_mask (float, 0 at linker positions, 1 elsewhere),
        chain_index (int64 chain id per position; linker positions carry the
        preceding chain's id).
    """
    chain_linker = "" if chain_linker is None else chain_linker
    residue_index_offset = 0 if residue_index_offset is None else residue_index_offset

    chains = seq.split(":")
    joined = chain_linker.join(chains)

    order = residue_constants.restype_order_with_x
    unk_idx = order["X"]
    encoded = torch.tensor([order.get(aa, unk_idx) for aa in joined])
    residx = torch.arange(len(encoded))

    linker_len = len(chain_linker)

    if residue_index_offset > 0:
        cursor = 0
        for chain_no, chain in enumerate(chains):
            span = len(chain) + linker_len
            # Shift every chain (and its trailing linker) by a per-chain offset.
            residx[cursor : cursor + span] += chain_no * residue_index_offset
            cursor += span

    linker_mask = torch.ones_like(encoded, dtype=torch.float32)
    chain_ids: T.List[int] = []
    pos = 0
    for chain_no, chain in enumerate(chains):
        if chain_no > 0:
            # Linker positions inherit the id of the chain they follow.
            chain_ids.extend([chain_no - 1] * linker_len)
        chain_ids.extend([chain_no] * len(chain))
        pos += len(chain)
        linker_mask[pos : pos + linker_len] = 0
        pos += linker_len

    chain_index = torch.tensor(chain_ids, dtype=torch.int64)

    return encoded, residx, linker_mask, chain_index
|
| 60 |
+
|
| 61 |
+
def batch_encode_sequences(
    sequences: T.Sequence[str],
    residue_index_offset: T.Optional[int] = 512,
    chain_linker: T.Optional[str] = "G" * 25,
) -> T.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Encode a batch of sequences and pad them into dense batch tensors.

    Each sequence goes through encode_sequence() with the same offset and
    linker settings; the per-sequence tensors are then right-padded to a
    common length via collate_dense_tensors().

    Returns:
        aatype, mask, residx, linker_mask, chain_index batch tensors;
        chain_index is padded with -1, the others with 0.
    """
    encoded = [
        encode_sequence(
            seq,
            residue_index_offset=residue_index_offset,
            chain_linker=chain_linker,
        )
        for seq in sequences
    ]
    if encoded:
        aatype_list, residx_list, linker_mask_list, chain_index_list = (
            list(column) for column in zip(*encoded)
        )
    else:
        aatype_list, residx_list, linker_mask_list, chain_index_list = [], [], [], []

    aatype = collate_dense_tensors(aatype_list)
    # Validity mask: ones over each sequence's true length, zero-padded.
    mask = collate_dense_tensors(
        [aatype.new_ones(len(t)) for t in aatype_list]
    )
    residx = collate_dense_tensors(residx_list)
    linker_mask = collate_dense_tensors(linker_mask_list)
    chain_index = collate_dense_tensors(chain_index_list, -1)

    return aatype, mask, residx, linker_mask, chain_index
|
| 93 |
+
def output_to_pdb(output: T.Dict) -> T.List[str]:
    """Convert a model output dict into PDB file strings, one per batch item.

    Args:
        output: forward-pass output dict of batched torch tensors; must
            contain "positions", "aatype", "atom37_atom_exists",
            "residue_index", "plddt", and optionally "chain_index".

    Returns:
        List of PDB-format strings, one per structure in the batch.
    """
    # atom14_to_atom37 must be called first, as it fails on latest numpy if the
    # input is a numpy array. It will work if the input is a torch tensor.
    final_atom_positions = atom14_to_atom37(output["positions"][-1], output)
    # Convert everything to CPU numpy for the openfold PDB writer.
    output = {k: v.to("cpu").numpy() for k, v in output.items()}
    final_atom_positions = final_atom_positions.cpu().numpy()
    final_atom_mask = output["atom37_atom_exists"]
    pdbs = []
    for i in range(output["aatype"].shape[0]):
        aa = output["aatype"][i]
        pred_pos = final_atom_positions[i]
        mask = final_atom_mask[i]
        resid = output["residue_index"][i] + 1  # PDB residue numbering is 1-based
        pred = OFProtein(
            aatype=aa,
            atom_positions=pred_pos,
            atom_mask=mask,
            residue_index=resid,
            # Per-residue pLDDT is written into the B-factor column.
            b_factors=output["plddt"][i],
            chain_index=output["chain_index"][i] if "chain_index" in output else None,
        )
        pdbs.append(to_pdb(pred))
    return pdbs
| 118 |
+
|
| 119 |
+
def collate_dense_tensors(
    samples: T.List[torch.Tensor], pad_v: float = 0
) -> torch.Tensor:
    """
    Stack a list of same-rank tensors into one batch tensor, padding each
    dimension out to the per-dimension maximum with ``pad_v``:

        [(d_11, ..., d_1K), ..., (d_N1, ..., d_NK)]
        -> (N, max_i d_i1, ..., max_i d_iK)

    An empty input list yields an empty ``torch.Tensor()``. All samples are
    assumed to live on the same device; the result takes the dtype of the
    first sample.
    """
    if not samples:
        return torch.Tensor()
    ranks = set(x.dim() for x in samples)
    if len(ranks) != 1:
        raise RuntimeError(
            f"Samples has varying dimensions: {[x.dim() for x in samples]}"
        )
    (device,) = tuple(set(x.device for x in samples))  # assumes all on same device
    max_shape = [max(sizes) for sizes in zip(*(x.shape for x in samples))]
    result = torch.empty(
        len(samples), *max_shape, dtype=samples[0].dtype, device=device
    ).fill_(pad_v)
    for slot, t in zip(result, samples):
        # Copy each sample into the top-left corner of its padded slot.
        slot[tuple(slice(0, k) for k in t.shape)] = t
    return result
|
| 149 |
+
|
| 150 |
+
class Attention(nn.Module):
    """Multi-head self-attention with an optional key-padding mask, an
    optional external pairwise bias, and an optional sigmoid output gate
    computed from the input."""

    def __init__(self, embed_dim, num_heads, head_width, gated=False):
        super().__init__()
        assert embed_dim == num_heads * head_width

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_width = head_width

        # Fused Q/K/V projection (no bias); separate output projection with bias.
        self.proj = nn.Linear(embed_dim, embed_dim * 3, bias=False)
        self.o_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.gated = gated
        if gated:
            self.g_proj = nn.Linear(embed_dim, embed_dim)
            # Zero weight + unit bias: the gate starts as the constant
            # sigmoid(1), independent of the input.
            torch.nn.init.zeros_(self.g_proj.weight)
            torch.nn.init.ones_(self.g_proj.bias)

        # Standard 1/sqrt(d_head) attention scaling.
        self.rescale_factor = self.head_width**-0.5

        torch.nn.init.zeros_(self.o_proj.bias)

    def forward(self, x, mask=None, bias=None, indices=None):
        """
        Basic self attention with optional mask and external pairwise bias.
        To handle sequences of different lengths, use mask.

        Inputs:
            x: batch of input sequneces (.. x L x C)
            mask: batch of boolean masks where 1=valid, 0=padding position (.. x L_k). optional.
            bias: batch of scalar pairwise attention biases (.. x Lq x Lk x num_heads). optional.
            indices: accepted but unused in this implementation.

        Outputs:
            sequence projection (B x L x embed_dim), attention maps (B x L x L x num_heads)
        """

        # Split the fused projection into per-head Q, K, V: (..., h, l, c).
        t = rearrange(self.proj(x), "... l (h c) -> ... h l c", h=self.num_heads)
        q, k, v = t.chunk(3, dim=-1)

        q = self.rescale_factor * q
        a = torch.einsum("...qc,...kc->...qk", q, k)

        # Add external attention bias.
        if bias is not None:
            a = a + rearrange(bias, "... lq lk h -> ... h lq lk")

        # Do not attend to padding tokens.
        if mask is not None:
            mask = repeat(
                mask, "... lk -> ... h lq lk", h=self.num_heads, lq=q.shape[-2]
            )
            # -inf before softmax zeroes the weight on padded keys.
            a = a.masked_fill(mask == False, -np.inf)

        a = F.softmax(a, dim=-1)

        y = torch.einsum("...hqk,...hkc->...qhc", a, v)
        y = rearrange(y, "... h c -> ... (h c)", h=self.num_heads)

        if self.gated:
            y = self.g_proj(x).sigmoid() * y
        y = self.o_proj(y)

        # NOTE(review): at this point `a` is laid out (..., h, lq, lk), but the
        # pattern below names the last three axes (lq, lk, h), so the returned
        # attention map's axis order may not match the docstring — confirm
        # against callers (the in-repo caller discards this second value).
        return y, rearrange(a, "... lq lk h -> ... h lq lk")
| 213 |
+
|
| 214 |
+
class Dropout(nn.Module):
    """Dropout whose mask is shared (broadcast) along the given dimension(s).

    The mask is sampled with size 1 along every dimension in ``batch_dim``
    and then multiplied into the input, so all positions along those
    dimensions are kept or dropped together.
    """

    def __init__(self, r: float, batch_dim: T.Union[int, T.List[int]]):
        super().__init__()
        self.r = r
        # Normalize a single dimension index to a list.
        self.batch_dim = [batch_dim] if isinstance(batch_dim, int) else batch_dim
        self.dropout = nn.Dropout(self.r)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        mask_shape = list(x.shape)
        if self.batch_dim is not None:
            for dim in self.batch_dim:
                mask_shape[dim] = 1  # size-1 axis => one shared mask, broadcast over it
        keep = self.dropout(x.new_ones(mask_shape))
        return x * keep
| 236 |
+
|
| 237 |
+
class SequenceToPair(nn.Module):
    """Build a pairwise representation from a per-residue representation via
    an outer product/difference of two learned projections."""

    def __init__(self, sequence_state_dim, inner_dim, pairwise_state_dim):
        super().__init__()

        self.layernorm = nn.LayerNorm(sequence_state_dim)
        self.proj = nn.Linear(sequence_state_dim, inner_dim * 2, bias=True)
        self.o_proj = nn.Linear(2 * inner_dim, pairwise_state_dim, bias=True)

        # Biases start at zero for both projections.
        nn.init.zeros_(self.proj.bias)
        nn.init.zeros_(self.o_proj.bias)

    def forward(self, sequence_state):
        """
        Inputs:
            sequence_state: B x L x sequence_state_dim

        Output:
            pairwise_state: B x L x L x pairwise_state_dim

        Intermediate state:
            B x L x L x 2*inner_dim
        """
        assert len(sequence_state.shape) == 3

        normed = self.layernorm(sequence_state)
        q, k = self.proj(normed).chunk(2, dim=-1)

        # Broadcast q over rows and k over columns of the pair grid.
        q_cols = q[:, None, :, :]
        k_rows = k[:, :, None, :]
        features = torch.cat([q_cols * k_rows, q_cols - k_rows], dim=-1)

        return self.o_proj(features)
| 274 |
+
|
| 275 |
+
class PairToSequence(nn.Module):
    """Project the pairwise state down to one scalar bias per attention head."""

    def __init__(self, pairwise_state_dim, num_heads):
        super().__init__()

        self.layernorm = nn.LayerNorm(pairwise_state_dim)
        self.linear = nn.Linear(pairwise_state_dim, num_heads, bias=False)

    def forward(self, pairwise_state):
        """
        Inputs:
            pairwise_state: B x L x L x pairwise_state_dim

        Output:
            pairwise_bias: B x L x L x num_heads
        """
        assert len(pairwise_state.shape) == 4
        # LayerNorm, then a bias-free linear head over the channel dimension.
        return self.linear(self.layernorm(pairwise_state))
| 295 |
+
|
| 296 |
+
class ResidueMLP(nn.Module):
    """Two-layer feed-forward block (norm -> expand -> ReLU -> project ->
    dropout) applied with a residual connection."""

    def __init__(self, embed_dim, inner_dim, norm=nn.LayerNorm, dropout=0):
        super().__init__()

        self.mlp = nn.Sequential(
            norm(embed_dim),
            nn.Linear(embed_dim, inner_dim),
            nn.ReLU(),
            nn.Linear(inner_dim, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        # Residual connection around the feed-forward stack.
        delta = self.mlp(x)
        return x + delta
esm/source/esm/esmfold/v1/pretrained.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
from esm.esmfold.v1.esmfold import ESMFold
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _load_model(model_name):
    """Instantiate an ESMFold model and load pretrained weights into it.

    Args:
        model_name: a local path ending in ".pt", or the bare name of a
            checkpoint hosted at dl.fbaipublicfiles.com/fair-esm/models/.

    Returns:
        An ``ESMFold`` built from the checkpoint's config with its weights
        loaded (non-strictly).

    Raises:
        RuntimeError: if any expected parameter outside the "esm." namespace
            is missing from the checkpoint.
    """
    if model_name.endswith(".pt"):  # local, treat as filepath
        model_path = Path(model_name)
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # checkpoint files from trusted sources.
        model_data = torch.load(str(model_path), map_location="cpu")
    else:  # load from hub
        url = f"https://dl.fbaipublicfiles.com/fair-esm/models/{model_name}.pt"
        model_data = torch.hub.load_state_dict_from_url(url, progress=False, map_location="cpu")

    cfg = model_data["cfg"]["model"]
    model_state = model_data["model"]
    model = ESMFold(esmfold_config=cfg)

    expected_keys = set(model.state_dict().keys())
    found_keys = set(model_state.keys())

    # Missing keys under "esm." are tolerated (presumably provided by the
    # language model loaded elsewhere — confirm); anything else missing
    # is treated as fatal.
    missing_essential_keys = []
    for missing_key in expected_keys - found_keys:
        if not missing_key.startswith("esm."):
            missing_essential_keys.append(missing_key)

    if missing_essential_keys:
        raise RuntimeError(f"Keys '{', '.join(missing_essential_keys)}' are missing.")

    # strict=False: allow the tolerated missing/extra keys checked above.
    model.load_state_dict(model_state, strict=False)

    return model
| 40 |
+
|
| 41 |
+
# Public constructors for the released ESMFold checkpoints. Each resolves a
# fixed checkpoint name through _load_model(), which fetches the weights from
# the FAIR public bucket (or a local .pt path) and builds the model.


def esmfold_v0():
    """
    ESMFold v0 model with 3B ESM-2, 48 folding blocks.
    This version was used for the paper (Lin et al, 2022). It was trained
    on all PDB chains until 2020-05, to ensure temporal holdout with CASP14
    and the CAMEO validation and test set reported there.
    """
    return _load_model("esmfold_3B_v0")


def esmfold_v1():
    """
    ESMFold v1 model using 3B ESM-2, 48 folding blocks.
    ESMFold provides fast high accuracy atomic level structure prediction
    directly from the individual sequence of a protein. ESMFold uses the ESM2
    protein language model to extract meaningful representations from the
    protein sequence.
    """
    return _load_model("esmfold_3B_v1")


# Ablation baselines below: structure module only (0 folding blocks) on top
# of ESM-2 language models of various sizes and training durations.


def esmfold_structure_module_only_8M():
    """
    ESMFold baseline model using 8M ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 500K updates.
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_8M")


def esmfold_structure_module_only_8M_270K():
    """
    ESMFold baseline model using 8M ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 270K updates.
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_8M_270K")


def esmfold_structure_module_only_35M():
    """
    ESMFold baseline model using 35M ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 500K updates.
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_35M")


def esmfold_structure_module_only_35M_270K():
    """
    ESMFold baseline model using 35M ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 270K updates.
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_35M_270K")


def esmfold_structure_module_only_150M():
    """
    ESMFold baseline model using 150M ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 500K updates.
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_150M")


def esmfold_structure_module_only_150M_270K():
    """
    ESMFold baseline model using 150M ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 270K updates.
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_150M_270K")


def esmfold_structure_module_only_650M():
    """
    ESMFold baseline model using 650M ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 500K updates.
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_650M")


def esmfold_structure_module_only_650M_270K():
    """
    ESMFold baseline model using 650M ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 270K updates.
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_650M_270K")


def esmfold_structure_module_only_3B():
    """
    ESMFold baseline model using 3B ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 500K updates.
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_3B")


def esmfold_structure_module_only_3B_270K():
    """
    ESMFold baseline model using 3B ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 270K updates.
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_3B_270K")


def esmfold_structure_module_only_15B():
    """
    ESMFold baseline model using 15B ESM-2, 0 folding blocks.
    ESM-2 here is trained out to 270K updates.
    The 15B parameter ESM-2 was not trained out to 500K updates
    This is a model designed to test the capabilities of the language model
    when ablated for number of parameters in the language model.
    See table S1 in (Lin et al, 2022).
    """
    return _load_model("esmfold_structure_module_only_15B")
|
esm/source/esm/esmfold/v1/tri_self_attn_block.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
import torch
|
| 6 |
+
from openfold.model.triangular_attention import (
|
| 7 |
+
TriangleAttentionEndingNode,
|
| 8 |
+
TriangleAttentionStartingNode,
|
| 9 |
+
)
|
| 10 |
+
from openfold.model.triangular_multiplicative_update import (
|
| 11 |
+
TriangleMultiplicationIncoming,
|
| 12 |
+
TriangleMultiplicationOutgoing,
|
| 13 |
+
)
|
| 14 |
+
from torch import nn
|
| 15 |
+
|
| 16 |
+
from esm.esmfold.v1.misc import (
|
| 17 |
+
Attention,
|
| 18 |
+
Dropout,
|
| 19 |
+
PairToSequence,
|
| 20 |
+
ResidueMLP,
|
| 21 |
+
SequenceToPair,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class TriangularSelfAttentionBlock(nn.Module):
    """One folding-trunk block: biased self-attention over the sequence state,
    then triangular multiplicative updates and triangular attention over the
    pairwise state, each applied as a residual with dropout."""

    def __init__(
        self,
        sequence_state_dim,
        pairwise_state_dim,
        sequence_head_width,
        pairwise_head_width,
        dropout=0,
        **__kwargs,
    ):
        super().__init__()

        # Head counts are derived from the state dims and head widths.
        assert sequence_state_dim % sequence_head_width == 0
        assert pairwise_state_dim % pairwise_head_width == 0
        sequence_num_heads = sequence_state_dim // sequence_head_width
        pairwise_num_heads = pairwise_state_dim // pairwise_head_width
        assert sequence_state_dim == sequence_num_heads * sequence_head_width
        assert pairwise_state_dim == pairwise_num_heads * pairwise_head_width
        assert pairwise_state_dim % 2 == 0

        self.sequence_state_dim = sequence_state_dim
        self.pairwise_state_dim = pairwise_state_dim

        self.layernorm_1 = nn.LayerNorm(sequence_state_dim)

        self.sequence_to_pair = SequenceToPair(
            sequence_state_dim, pairwise_state_dim // 2, pairwise_state_dim
        )
        self.pair_to_sequence = PairToSequence(pairwise_state_dim, sequence_num_heads)

        self.seq_attention = Attention(
            sequence_state_dim, sequence_num_heads, sequence_head_width, gated=True
        )
        self.tri_mul_out = TriangleMultiplicationOutgoing(
            pairwise_state_dim,
            pairwise_state_dim,
        )
        self.tri_mul_in = TriangleMultiplicationIncoming(
            pairwise_state_dim,
            pairwise_state_dim,
        )
        self.tri_att_start = TriangleAttentionStartingNode(
            pairwise_state_dim,
            pairwise_head_width,
            pairwise_num_heads,
            inf=1e9,
        )  # type: ignore
        self.tri_att_end = TriangleAttentionEndingNode(
            pairwise_state_dim,
            pairwise_head_width,
            pairwise_num_heads,
            inf=1e9,
        )  # type: ignore

        self.mlp_seq = ResidueMLP(sequence_state_dim, 4 * sequence_state_dim, dropout=dropout)
        self.mlp_pair = ResidueMLP(pairwise_state_dim, 4 * pairwise_state_dim, dropout=dropout)

        # row_drop/col_drop share one dropout mask along a pair-grid axis.
        assert dropout < 0.4
        self.drop = nn.Dropout(dropout)
        self.row_drop = Dropout(dropout * 2, 2)
        self.col_drop = Dropout(dropout * 2, 1)

        # Zero-initialize the output projections of every residual branch so
        # each branch starts as the identity.
        torch.nn.init.zeros_(self.tri_mul_in.linear_z.weight)
        torch.nn.init.zeros_(self.tri_mul_in.linear_z.bias)
        torch.nn.init.zeros_(self.tri_mul_out.linear_z.weight)
        torch.nn.init.zeros_(self.tri_mul_out.linear_z.bias)
        torch.nn.init.zeros_(self.tri_att_start.mha.linear_o.weight)
        torch.nn.init.zeros_(self.tri_att_start.mha.linear_o.bias)
        torch.nn.init.zeros_(self.tri_att_end.mha.linear_o.weight)
        torch.nn.init.zeros_(self.tri_att_end.mha.linear_o.bias)

        torch.nn.init.zeros_(self.sequence_to_pair.o_proj.weight)
        torch.nn.init.zeros_(self.sequence_to_pair.o_proj.bias)
        torch.nn.init.zeros_(self.pair_to_sequence.linear.weight)
        torch.nn.init.zeros_(self.seq_attention.o_proj.weight)
        torch.nn.init.zeros_(self.seq_attention.o_proj.bias)
        torch.nn.init.zeros_(self.mlp_seq.mlp[-2].weight)
        torch.nn.init.zeros_(self.mlp_seq.mlp[-2].bias)
        torch.nn.init.zeros_(self.mlp_pair.mlp[-2].weight)
        torch.nn.init.zeros_(self.mlp_pair.mlp[-2].bias)

    def forward(self, sequence_state, pairwise_state, mask=None, chunk_size=None, **__kwargs):
        """
        Inputs:
            sequence_state: B x L x sequence_state_dim
            pairwise_state: B x L x L x pairwise_state_dim
            mask: B x L boolean tensor of valid positions
            chunk_size: forwarded to the triangular attention modules.

        Output:
            sequence_state: B x L x sequence_state_dim
            pairwise_state: B x L x L x pairwise_state_dim
        """
        assert len(sequence_state.shape) == 3
        assert len(pairwise_state.shape) == 4
        if mask is not None:
            assert len(mask.shape) == 2

        batch_dim, seq_dim, sequence_state_dim = sequence_state.shape
        pairwise_state_dim = pairwise_state.shape[3]
        assert sequence_state_dim == self.sequence_state_dim
        assert pairwise_state_dim == self.pairwise_state_dim
        assert batch_dim == pairwise_state.shape[0]
        assert seq_dim == pairwise_state.shape[1]
        assert seq_dim == pairwise_state.shape[2]

        # Update sequence state: attention bias comes from the pairwise state.
        bias = self.pair_to_sequence(pairwise_state)

        # Self attention with bias + mlp.
        y = self.layernorm_1(sequence_state)
        y, _ = self.seq_attention(y, mask=mask, bias=bias)
        sequence_state = sequence_state + self.drop(y)
        sequence_state = self.mlp_seq(sequence_state)

        # Update pairwise state from the refreshed sequence state.
        pairwise_state = pairwise_state + self.sequence_to_pair(sequence_state)

        # Axial attention with triangular bias.
        # tri_mask marks pairs where both positions are valid.
        tri_mask = mask.unsqueeze(2) * mask.unsqueeze(1) if mask is not None else None
        pairwise_state = pairwise_state + self.row_drop(
            self.tri_mul_out(pairwise_state, mask=tri_mask)
        )
        pairwise_state = pairwise_state + self.col_drop(
            self.tri_mul_in(pairwise_state, mask=tri_mask)
        )
        pairwise_state = pairwise_state + self.row_drop(
            self.tri_att_start(pairwise_state, mask=tri_mask, chunk_size=chunk_size)
        )
        pairwise_state = pairwise_state + self.col_drop(
            self.tri_att_end(pairwise_state, mask=tri_mask, chunk_size=chunk_size)
        )

        # MLP over pairs.
        pairwise_state = self.mlp_pair(pairwise_state)

        return sequence_state, pairwise_state
|
esm/source/esm/esmfold/v1/trunk.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
import typing as T
from contextlib import ExitStack
from dataclasses import dataclass, field

import torch
import torch.nn as nn
from openfold.model.structure_module import StructureModule

from esm.esmfold.v1.tri_self_attn_block import TriangularSelfAttentionBlock
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass
class StructureModuleConfig:
    """Hyperparameters for the structure-module head.

    Field names presumably mirror the openfold ``StructureModule``
    constructor arguments — confirm against that class for exact semantics.
    """

    c_s: int = 384
    c_z: int = 128
    c_ipa: int = 16
    c_resnet: int = 128
    no_heads_ipa: int = 12
    no_qk_points: int = 4
    no_v_points: int = 8
    dropout_rate: float = 0.1
    no_blocks: int = 8
    no_transition_layers: int = 1
    no_resnet_blocks: int = 2
    no_angles: int = 7
    trans_scale_factor: int = 10
    epsilon: float = 1e-8
    inf: float = 1e5


@dataclass
class FoldingTrunkConfig:
    """Hyperparameters for the folding trunk (block count, state widths,
    recycling and dropout settings), plus a nested structure-module config."""

    _name: str = "FoldingTrunkConfig"
    num_blocks: int = 48
    sequence_state_dim: int = 1024
    pairwise_state_dim: int = 128
    sequence_head_width: int = 32
    pairwise_head_width: int = 32
    position_bins: int = 32
    dropout: float = 0
    layer_drop: float = 0
    cpu_grad_checkpoint: bool = False

    max_recycles: int = 4
    chunk_size: T.Optional[int] = None

    # A bare `= StructureModuleConfig()` default is rejected by the dataclass
    # machinery on Python >= 3.11 (mutable/unhashable default) and, on older
    # versions, silently aliases ONE config object across every
    # FoldingTrunkConfig instance. A factory gives each instance its own copy.
    structure_module: StructureModuleConfig = field(
        default_factory=StructureModuleConfig
    )
|
| 53 |
+
|
| 54 |
+
def get_axial_mask(mask):
    """
    Expand a B x L mask of valid positions into the flattened form used
    by row/column (axial) attention.

    Input:
        mask: B x L tensor of booleans, or None

    Output:
        (B*L) x L tensor of booleans (each row of the input repeated L
        times), or None when the input is None
    """

    if mask is None:
        return None
    assert len(mask.shape) == 2
    n_batch, n_seq = mask.shape
    # Broadcast each per-sequence row across a new length axis, then fold
    # batch and length together into one leading dimension.
    expanded = mask.unsqueeze(1).expand(n_batch, n_seq, n_seq)
    return expanded.reshape(n_batch * n_seq, n_seq)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class RelativePosition(nn.Module):
    """Embed clamped relative sequence separations into pairwise features."""

    def __init__(self, bins, pairwise_state_dim):
        super().__init__()
        self.bins = bins

        # One extra offset so that index 0 is reserved for masked pairs;
        # valid separations occupy indices 1 .. 2*bins + 1.
        self.embedding = torch.nn.Embedding(2 * bins + 2, pairwise_state_dim)

    def forward(self, residue_index, mask=None):
        """
        Input:
            residue_index: B x L tensor of indices (dtype=torch.long)
            mask: B x L tensor of booleans

        Output:
            pairwise_state: B x L x L x pairwise_state_dim tensor of embeddings
        """

        assert residue_index.dtype == torch.long
        if mask is not None:
            assert residue_index.shape == mask.shape

        # Signed separation j - i, clamped to [-bins, bins], then shifted so
        # that 0 remains free for the masked-pair sentinel.
        separation = residue_index[:, None, :] - residue_index[:, :, None]
        separation = separation.clamp(-self.bins, self.bins)
        separation = separation + self.bins + 1

        if mask is not None:
            pair_mask = mask[:, None, :] * mask[:, :, None]
            separation = separation.masked_fill(pair_mask == False, 0)

        return self.embedding(separation)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class FoldingTrunk(nn.Module):
    """Evoformer-style folding trunk: a stack of triangular self-attention
    blocks with recycling, feeding an OpenFold structure module.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.cfg = FoldingTrunkConfig(**kwargs)
        assert self.cfg.max_recycles > 0

        c_s = self.cfg.sequence_state_dim
        c_z = self.cfg.pairwise_state_dim

        # Head widths must evenly divide the state dims.
        assert c_s % self.cfg.sequence_head_width == 0
        assert c_z % self.cfg.pairwise_head_width == 0
        block = TriangularSelfAttentionBlock

        self.pairwise_positional_embedding = RelativePosition(self.cfg.position_bins, c_z)

        self.blocks = nn.ModuleList(
            [
                block(
                    sequence_state_dim=c_s,
                    pairwise_state_dim=c_z,
                    sequence_head_width=self.cfg.sequence_head_width,
                    pairwise_head_width=self.cfg.pairwise_head_width,
                    dropout=self.cfg.dropout,
                )
                for i in range(self.cfg.num_blocks)
            ]
        )

        # Recycling: previous-iteration states are layer-normed and the
        # previous distogram is embedded back into the pair state.
        self.recycle_bins = 15
        self.recycle_s_norm = nn.LayerNorm(c_s)
        self.recycle_z_norm = nn.LayerNorm(c_z)
        self.recycle_disto = nn.Embedding(self.recycle_bins, c_z)
        # Zero the bin-0 embedding so the first pass (all-zero recycle_bins)
        # contributes nothing to the pair state.
        self.recycle_disto.weight[0].detach().zero_()

        # NOTE(review): ** unpacking of the dataclass presumably works because
        # the config is an OmegaConf/dict-like at runtime — confirm upstream.
        self.structure_module = StructureModule(**self.cfg.structure_module)  # type: ignore
        self.trunk2sm_s = nn.Linear(c_s, self.structure_module.c_s)
        self.trunk2sm_z = nn.Linear(c_z, self.structure_module.c_z)

        self.chunk_size = self.cfg.chunk_size

    def set_chunk_size(self, chunk_size):
        # This parameter means the axial attention will be computed
        # in a chunked manner. This should make the memory used more or less O(L) instead of O(L^2).
        # It's equivalent to running a for loop over chunks of the dimension we're iterative over,
        # where the chunk_size is the size of the chunks, so 128 would mean to parse 128-lengthed chunks.
        self.chunk_size = chunk_size

    def forward(self, seq_feats, pair_feats, true_aa, residx, mask, no_recycles: T.Optional[int] = None):
        """
        Inputs:
          seq_feats: B x L x C tensor of sequence features
          pair_feats: B x L x L x C tensor of pair features
          true_aa: B x L residue identities passed to the structure module
          residx: B x L long tensor giving the position in the sequence
          mask: B x L boolean tensor indicating valid residues
          no_recycles: number of extra recycling passes; defaults to
              cfg.max_recycles when None

        Output:
          structure: dict produced by the structure module, with the final
          trunk states added under "s_s" (B x L x C) and "s_z" (B x L x L x C)
        """

        device = seq_feats.device
        s_s_0 = seq_feats
        s_z_0 = pair_feats

        if no_recycles is None:
            no_recycles = self.cfg.max_recycles
        else:
            assert no_recycles >= 0, "Number of recycles must not be negative."
            no_recycles += 1  # First 'recycle' is just the standard forward pass through the model.

        def trunk_iter(s, z, residx, mask):
            # One full pass through the attention blocks; relative-position
            # embeddings are re-added to the pair state each iteration.
            z = z + self.pairwise_positional_embedding(residx, mask=mask)

            for block in self.blocks:
                s, z = block(s, z, mask=mask, residue_index=residx, chunk_size=self.chunk_size)
            return s, z

        s_s = s_s_0
        s_z = s_z_0
        recycle_s = torch.zeros_like(s_s)
        recycle_z = torch.zeros_like(s_z)
        recycle_bins = torch.zeros(*s_z.shape[:-1], device=device, dtype=torch.int64)

        assert no_recycles > 0
        for recycle_idx in range(no_recycles):
            # Only the last pass keeps gradients; earlier passes run under
            # no_grad (ExitStack() is a no-op context manager).
            with ExitStack() if recycle_idx == no_recycles - 1 else torch.no_grad():
                # === Recycling ===
                recycle_s = self.recycle_s_norm(recycle_s.detach())
                recycle_z = self.recycle_z_norm(recycle_z.detach())
                recycle_z += self.recycle_disto(recycle_bins.detach())

                s_s, s_z = trunk_iter(s_s_0 + recycle_s, s_z_0 + recycle_z, residx, mask)

                # === Structure module ===
                structure = self.structure_module(
                    {"single": self.trunk2sm_s(s_s), "pair": self.trunk2sm_z(s_z)},
                    true_aa,
                    mask.float(),
                )

                recycle_s = s_s
                recycle_z = s_z
                # Distogram needs the N, CA, C coordinates, and bin constants same as alphafold.
                recycle_bins = FoldingTrunk.distogram(
                    structure["positions"][-1][:, :, :3],
                    3.375,
                    21.375,
                    self.recycle_bins,
                )

        assert isinstance(structure, dict)  # type: ignore
        structure["s_s"] = s_s
        structure["s_z"] = s_z

        return structure

    @staticmethod
    def distogram(coords, min_bin, max_bin, num_bins):
        """Bin pairwise CB-CB squared distances into `num_bins` distogram bins."""
        # Coords are [... L x 3 x 3], where it's [N, CA, C] x 3 coordinates.
        boundaries = torch.linspace(
            min_bin,
            max_bin,
            num_bins - 1,
            device=coords.device,
        )
        # Compare squared distances against squared boundaries to skip a sqrt.
        boundaries = boundaries**2
        N, CA, C = [x.squeeze(-2) for x in coords.chunk(3, dim=-2)]
        # Infer CB coordinates from the backbone frame (fixed geometry constants).
        b = CA - N
        c = C - CA
        a = b.cross(c, dim=-1)
        CB = -0.58273431 * a + 0.56802827 * b - 0.54067466 * c + CA
        dists = (CB[..., None, :, :] - CB[..., :, None, :]).pow(2).sum(dim=-1, keepdims=True)
        bins = torch.sum(dists > boundaries, dim=-1)  # [..., L, L]
        return bins
|
esm/source/esm/inverse_folding/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
from . import gvp_transformer
|
| 7 |
+
from . import util
|
| 8 |
+
from . import multichain_util
|
esm/source/esm/inverse_folding/features.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
#
|
| 6 |
+
# Portions of this file were adapted from the open source code for the following
|
| 7 |
+
# two papers:
|
| 8 |
+
#
|
| 9 |
+
# Ingraham, J., Garg, V., Barzilay, R., & Jaakkola, T. (2019). Generative
|
| 10 |
+
# models for graph-based protein design. Advances in Neural Information
|
| 11 |
+
# Processing Systems, 32.
|
| 12 |
+
#
|
| 13 |
+
# Jing, B., Eismann, S., Suriana, P., Townshend, R. J. L., & Dror, R. (2020).
|
| 14 |
+
# Learning from Protein Structure with Geometric Vector Perceptrons. In
|
| 15 |
+
# International Conference on Learning Representations.
|
| 16 |
+
#
|
| 17 |
+
# MIT License
|
| 18 |
+
#
|
| 19 |
+
# Copyright (c) 2020 Bowen Jing, Stephan Eismann, Patricia Suriana, Raphael Townshend, Ron Dror
|
| 20 |
+
#
|
| 21 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 22 |
+
# of this software and associated documentation files (the "Software"), to deal
|
| 23 |
+
# in the Software without restriction, including without limitation the rights
|
| 24 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 25 |
+
# copies of the Software, and to permit persons to whom the Software is
|
| 26 |
+
# furnished to do so, subject to the following conditions:
|
| 27 |
+
#
|
| 28 |
+
# The above copyright notice and this permission notice shall be included in all
|
| 29 |
+
# copies or substantial portions of the Software.
|
| 30 |
+
#
|
| 31 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 32 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 33 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 34 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 35 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 36 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 37 |
+
# SOFTWARE.
|
| 38 |
+
#
|
| 39 |
+
# ================================================================
|
| 40 |
+
# The below license applies to the portions of the code (parts of
|
| 41 |
+
# src/datasets.py and src/models.py) adapted from Ingraham, et al.
|
| 42 |
+
# ================================================================
|
| 43 |
+
#
|
| 44 |
+
# MIT License
|
| 45 |
+
#
|
| 46 |
+
# Copyright (c) 2019 John Ingraham, Vikas Garg, Regina Barzilay, Tommi Jaakkola
|
| 47 |
+
#
|
| 48 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 49 |
+
# of this software and associated documentation files (the "Software"), to deal
|
| 50 |
+
# in the Software without restriction, including without limitation the rights
|
| 51 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 52 |
+
# copies of the Software, and to permit persons to whom the Software is
|
| 53 |
+
# furnished to do so, subject to the following conditions:
|
| 54 |
+
#
|
| 55 |
+
# The above copyright notice and this permission notice shall be included in all
|
| 56 |
+
# copies or substantial portions of the Software.
|
| 57 |
+
#
|
| 58 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 59 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 60 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 61 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 62 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 63 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 64 |
+
# SOFTWARE.
|
| 65 |
+
|
| 66 |
+
import math
|
| 67 |
+
import numpy as np
|
| 68 |
+
import torch
|
| 69 |
+
import torch.nn as nn
|
| 70 |
+
import torch.nn.functional as F
|
| 71 |
+
|
| 72 |
+
from .gvp_utils import flatten_graph
|
| 73 |
+
from .gvp_modules import GVP, LayerNorm
|
| 74 |
+
from .util import normalize, norm, nan_to_num, rbf
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class GVPInputFeaturizer(nn.Module):
    """Static helpers turning backbone coordinates into GVP node features
    (scalar dihedrals plus vector orientations/sidechain directions) and
    k-nearest-neighbor graph quantities.
    """

    @staticmethod
    def get_node_features(coords, coord_mask, with_coord_mask=True):
        """Return (scalar, vector) per-residue node features.

        coords: B x L x A x 3 backbone coordinates (first three atoms are
            assumed to be N, CA, C — TODO confirm against callers).
        coord_mask: B x L booleans marking residues with real coordinates.
        """
        # scalar features
        node_scalar_features = GVPInputFeaturizer._dihedrals(coords)
        if with_coord_mask:
            # Append coordinate presence as one extra scalar channel.
            node_scalar_features = torch.cat([
                node_scalar_features,
                coord_mask.float().unsqueeze(-1)
            ], dim=-1)
        # vector features
        X_ca = coords[:, :, 1]
        orientations = GVPInputFeaturizer._orientations(X_ca)
        sidechains = GVPInputFeaturizer._sidechains(coords)
        node_vector_features = torch.cat([orientations, sidechains.unsqueeze(-2)], dim=-2)
        return node_scalar_features, node_vector_features

    @staticmethod
    def _orientations(X):
        # Unit vectors to the next and previous CA; zero-padded at chain ends.
        forward = normalize(X[:, 1:] - X[:, :-1])
        backward = normalize(X[:, :-1] - X[:, 1:])
        forward = F.pad(forward, [0, 0, 0, 1])
        backward = F.pad(backward, [0, 0, 1, 0])
        return torch.cat([forward.unsqueeze(-2), backward.unsqueeze(-2)], -2)

    @staticmethod
    def _sidechains(X):
        # Imputed CB direction from the N, CA, C atoms (tetrahedral geometry).
        n, origin, c = X[:, :, 0], X[:, :, 1], X[:, :, 2]
        c, n = normalize(c - origin), normalize(n - origin)
        bisector = normalize(c + n)
        perp = normalize(torch.cross(c, n, dim=-1))
        vec = -bisector * math.sqrt(1 / 3) - perp * math.sqrt(2 / 3)
        return vec

    @staticmethod
    def _dihedrals(X, eps=1e-7):
        """Backbone dihedral angles lifted to (cos, sin) pairs, B x L x 6."""
        # Flatten the N/CA/C atoms into one sequence of backbone points.
        X = torch.flatten(X[:, :, :3], 1, 2)
        bsz = X.shape[0]
        dX = X[:, 1:] - X[:, :-1]
        U = normalize(dX, dim=-1)
        u_2 = U[:, :-2]
        u_1 = U[:, 1:-1]
        u_0 = U[:, 2:]

        # Backbone normals
        n_2 = normalize(torch.cross(u_2, u_1, dim=-1), dim=-1)
        n_1 = normalize(torch.cross(u_1, u_0, dim=-1), dim=-1)

        # Angle between normals
        cosD = torch.sum(n_2 * n_1, -1)
        # Clamp away from +/-1 so acos stays finite.
        cosD = torch.clamp(cosD, -1 + eps, 1 - eps)
        D = torch.sign(torch.sum(u_2 * n_1, -1)) * torch.acos(cosD)

        # This scheme will remove phi[0], psi[-1], omega[-1]
        D = F.pad(D, [1, 2])
        D = torch.reshape(D, [bsz, -1, 3])
        # Lift angle representations to the circle
        D_features = torch.cat([torch.cos(D), torch.sin(D)], -1)
        return D_features

    @staticmethod
    def _positional_embeddings(edge_index,
                               num_embeddings=None,
                               num_positional_embeddings=16,
                               period_range=[2, 1000]):
        """Sinusoidal embedding of each edge's signed sequence separation."""
        # From https://github.com/jingraham/neurips19-graph-protein-design
        num_embeddings = num_embeddings or num_positional_embeddings
        d = edge_index[0] - edge_index[1]

        frequency = torch.exp(
            torch.arange(0, num_embeddings, 2, dtype=torch.float32,
                device=edge_index.device)
            * -(np.log(10000.0) / num_embeddings)
        )
        angles = d.unsqueeze(-1) * frequency
        E = torch.cat((torch.cos(angles), torch.sin(angles)), -1)
        return E

    @staticmethod
    def _dist(X, coord_mask, padding_mask, top_k_neighbors, eps=1e-8):
        """ Pairwise euclidean distances """
        bsz, maxlen = X.size(0), X.size(1)
        coord_mask_2D = torch.unsqueeze(coord_mask,1) * torch.unsqueeze(coord_mask,2)
        residue_mask = ~padding_mask
        residue_mask_2D = torch.unsqueeze(residue_mask,1) * torch.unsqueeze(residue_mask,2)
        dX = torch.unsqueeze(X,1) - torch.unsqueeze(X,2)
        D = coord_mask_2D * norm(dX, dim=-1)

        # sorting preference: first those with coords, then among the residues that
        # exist but are masked use distance in sequence as tie breaker, and then the
        # residues that came from padding are last
        seqpos = torch.arange(maxlen, device=X.device)
        Dseq = torch.abs(seqpos.unsqueeze(1) - seqpos.unsqueeze(0)).repeat(bsz, 1, 1)
        # Large additive penalties push coordless (1e8 + sequence-distance
        # tiebreak) and padded (1e10) pairs to the back of the sort.
        D_adjust = nan_to_num(D) + (~coord_mask_2D) * (1e8 + Dseq*1e6) + (
            ~residue_mask_2D) * (1e10)

        if top_k_neighbors == -1:
            # Keep the full distance matrix; neighbor indices are just 0..L-1.
            D_neighbors = D_adjust
            E_idx = seqpos.repeat(
                *D_neighbors.shape[:-1], 1)
        else:
            # Identify k nearest neighbors (including self)
            k = min(top_k_neighbors, X.size(1))
            D_neighbors, E_idx = torch.topk(D_adjust, k, dim=-1, largest=False)

        # Thresholds recover which neighbors had real coordinates / real
        # residues from the penalty offsets added above.
        coord_mask_neighbors = (D_neighbors < 5e7)
        residue_mask_neighbors = (D_neighbors < 5e9)
        return D_neighbors, E_idx, coord_mask_neighbors, residue_mask_neighbors
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
class Normalize(nn.Module):
    """Layer normalization over one axis with a learnable per-feature
    gain and bias.

    `epsilon` guards against division by zero; note it enters the
    computation twice (inside the sqrt and in the divisor), matching the
    original formulation.
    """

    def __init__(self, features, epsilon=1e-6):
        super(Normalize, self).__init__()
        self.gain = nn.Parameter(torch.ones(features))
        self.bias = nn.Parameter(torch.zeros(features))
        self.epsilon = epsilon

    def forward(self, x, dim=-1):
        """Normalize `x` along `dim`, then apply the affine transform."""
        mean = x.mean(dim, keepdim=True)
        std = torch.sqrt(x.var(dim, keepdim=True) + self.epsilon)
        if dim == -1:
            scale, shift = self.gain, self.bias
        else:
            # Move the feature axis of gain/bias to `dim` so both
            # broadcast correctly against x.
            view_shape = [1] * len(mean.size())
            view_shape[dim] = self.gain.size()[0]
            scale = self.gain.view(view_shape)
            shift = self.bias.view(view_shape)
        return scale * (x - mean) / (std + self.epsilon) + shift
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
class DihedralFeatures(nn.Module):
    """Embed backbone dihedral angles into a fixed-size node feature."""

    def __init__(self, node_embed_dim):
        """ Embed dihedral angle features. """
        super(DihedralFeatures, self).__init__()
        # 3 dihedral angles; sin and cos of each angle
        node_in = 6
        # Normalization and embedding
        self.node_embedding = nn.Linear(node_in, node_embed_dim, bias=True)
        self.norm_nodes = Normalize(node_embed_dim)

    def forward(self, X):
        """ Featurize coordinates as an attributed graph """
        # X: backbone coordinates whose first three atoms per residue are
        # N, CA, C (see _dihedrals).
        V = self._dihedrals(X)
        V = self.node_embedding(V)
        V = self.norm_nodes(V)
        return V

    @staticmethod
    def _dihedrals(X, eps=1e-7, return_angles=False):
        """Compute phi/psi/omega from backbone coordinates.

        Returns the raw angle triplets when `return_angles` is True,
        otherwise their (cos, sin) lifting with shape B x L x 6.
        """
        # First 3 coordinates are N, CA, C
        X = X[:,:,:3,:].reshape(X.shape[0], 3*X.shape[1], 3)

        # Shifted slices of unit vectors
        dX = X[:,1:,:] - X[:,:-1,:]
        U = F.normalize(dX, dim=-1)
        u_2 = U[:,:-2,:]
        u_1 = U[:,1:-1,:]
        u_0 = U[:,2:,:]
        # Backbone normals
        n_2 = F.normalize(torch.cross(u_2, u_1, dim=-1), dim=-1)
        n_1 = F.normalize(torch.cross(u_1, u_0, dim=-1), dim=-1)

        # Angle between normals
        cosD = (n_2 * n_1).sum(-1)
        # Clamp away from +/-1 so acos stays finite.
        cosD = torch.clamp(cosD, -1+eps, 1-eps)
        D = torch.sign((u_2 * n_1).sum(-1)) * torch.acos(cosD)

        # This scheme will remove phi[0], psi[-1], omega[-1]
        D = F.pad(D, (1,2), 'constant', 0)
        D = D.view((D.size(0), int(D.size(1)/3), 3))
        phi, psi, omega = torch.unbind(D,-1)

        if return_angles:
            return phi, psi, omega

        # Lift angle representations to the circle
        D_features = torch.cat((torch.cos(D), torch.sin(D)), 2)
        return D_features
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
class GVPGraphEmbedding(GVPInputFeaturizer):
    """Build the initial GVP graph: embedded node and edge features plus a
    flattened k-NN edge index in the layout torch_geometric expects.
    """

    def __init__(self, args):
        super().__init__()
        self.top_k_neighbors = args.top_k_neighbors
        self.num_positional_embeddings = 16
        self.remove_edges_without_coords = True
        # (scalar, vector) channel counts of the raw geometric features.
        node_input_dim = (7, 3)
        edge_input_dim = (34, 1)
        node_hidden_dim = (args.node_hidden_dim_scalar,
                args.node_hidden_dim_vector)
        edge_hidden_dim = (args.edge_hidden_dim_scalar,
                args.edge_hidden_dim_vector)
        self.embed_node = nn.Sequential(
            GVP(node_input_dim, node_hidden_dim, activations=(None, None)),
            LayerNorm(node_hidden_dim, eps=1e-4)
        )
        self.embed_edge = nn.Sequential(
            GVP(edge_input_dim, edge_hidden_dim, activations=(None, None)),
            LayerNorm(edge_hidden_dim, eps=1e-4)
        )
        self.embed_confidence = nn.Linear(16, args.node_hidden_dim_scalar)

    def forward(self, coords, coord_mask, padding_mask, confidence):
        # Raw geometric features carry no gradient; only the embedding
        # layers below are trained.
        with torch.no_grad():
            node_features = self.get_node_features(coords, coord_mask)
            edge_features, edge_index = self.get_edge_features(
                coords, coord_mask, padding_mask)
        node_embeddings_scalar, node_embeddings_vector = self.embed_node(node_features)
        edge_embeddings = self.embed_edge(edge_features)

        # Per-residue confidence (RBF over [0, 1]) is added to the scalar channel.
        rbf_rep = rbf(confidence, 0., 1.)
        node_embeddings = (
            node_embeddings_scalar + self.embed_confidence(rbf_rep),
            node_embeddings_vector
        )

        node_embeddings, edge_embeddings, edge_index = flatten_graph(
            node_embeddings, edge_embeddings, edge_index)
        return node_embeddings, edge_embeddings, edge_index

    def get_edge_features(self, coords, coord_mask, padding_mask):
        """Return ((edge_scalar, edge_vector), edge_index) for the k-NN graph."""
        X_ca = coords[:, :, 1]
        # Get distances to the top k neighbors
        E_dist, E_idx, E_coord_mask, E_residue_mask = GVPInputFeaturizer._dist(
            X_ca, coord_mask, padding_mask, self.top_k_neighbors)
        # Flatten the graph to be batch size 1 for torch_geometric package
        dest = E_idx
        B, L, k = E_idx.shape[:3]
        src = torch.arange(L, device=E_idx.device).view([1, L, 1]).expand(B, L, k)
        # After flattening, [2, B, E]
        edge_index = torch.stack([src, dest], dim=0).flatten(2, 3)
        # After flattening, [B, E]
        E_dist = E_dist.flatten(1, 2)
        E_coord_mask = E_coord_mask.flatten(1, 2).unsqueeze(-1)
        E_residue_mask = E_residue_mask.flatten(1, 2)
        # Calculate relative positional embeddings and distance RBF
        pos_embeddings = GVPInputFeaturizer._positional_embeddings(
            edge_index,
            num_positional_embeddings=self.num_positional_embeddings,
        )
        D_rbf = rbf(E_dist, 0., 20.)
        # Calculate relative orientation
        X_src = X_ca.unsqueeze(2).expand(-1, -1, k, -1).flatten(1, 2)
        X_dest = torch.gather(
            X_ca,
            1,
            edge_index[1, :, :].unsqueeze(-1).expand([B, L*k, 3])
        )
        coord_mask_src = coord_mask.unsqueeze(2).expand(-1, -1, k).flatten(1, 2)
        coord_mask_dest = torch.gather(
            coord_mask,
            1,
            edge_index[1, :, :].expand([B, L*k])
        )
        E_vectors = X_src - X_dest
        # For the ones without coordinates, substitute in the average vector
        E_vector_mean = torch.sum(E_vectors * E_coord_mask, dim=1,
                keepdims=True) / torch.sum(E_coord_mask, dim=1, keepdims=True)
        E_vectors = E_vectors * E_coord_mask + E_vector_mean * ~(E_coord_mask)
        # Normalize and remove nans
        edge_s = torch.cat([D_rbf, pos_embeddings], dim=-1)
        edge_v = normalize(E_vectors).unsqueeze(-2)
        edge_s, edge_v = map(nan_to_num, (edge_s, edge_v))
        # Also add indications of whether the coordinates are present
        edge_s = torch.cat([
            edge_s,
            (~coord_mask_src).float().unsqueeze(-1),
            (~coord_mask_dest).float().unsqueeze(-1),
        ], dim=-1)
        # Mark edges to padded residues as invalid with index -1 (and,
        # optionally, edges touching residues without coordinates).
        edge_index[:, ~E_residue_mask] = -1
        if self.remove_edges_without_coords:
            edge_index[:, ~E_coord_mask.squeeze(-1)] = -1
        return (edge_s, edge_v), edge_index.transpose(0, 1)
|
esm/source/esm/inverse_folding/gvp_encoder.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
from argparse import Namespace
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
|
| 12 |
+
from .features import GVPGraphEmbedding
|
| 13 |
+
from .gvp_modules import GVPConvLayer, LayerNorm
|
| 14 |
+
from .gvp_utils import unflatten_graph
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class GVPEncoder(nn.Module):
    """Stack of GVP graph-convolution layers over a protein structure graph."""

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.embed_graph = GVPGraphEmbedding(args)

        node_dims = (args.node_hidden_dim_scalar,
                     args.node_hidden_dim_vector)
        edge_dims = (args.edge_hidden_dim_scalar,
                     args.edge_hidden_dim_vector)

        def build_layer():
            # One GVP convolution layer: relu on scalar channels, sigmoid
            # gating on vector channels.
            return GVPConvLayer(
                node_dims,
                edge_dims,
                drop_rate=args.dropout,
                vector_gate=True,
                attention_heads=0,
                n_message=3,
                conv_activations=(F.relu, torch.sigmoid),
                n_edge_gvps=0,
                eps=1e-4,
                layernorm=True,
            )

        self.encoder_layers = nn.ModuleList(
            [build_layer() for _ in range(args.num_encoder_layers)]
        )

    def forward(self, coords, coord_mask, padding_mask, confidence):
        """Embed the structure graph, run the encoder stack, and return
        per-node embeddings unflattened back to batch form."""
        node_embeddings, edge_embeddings, edge_index = self.embed_graph(
            coords, coord_mask, padding_mask, confidence)

        for layer in self.encoder_layers:
            node_embeddings, edge_embeddings = layer(
                node_embeddings, edge_index, edge_embeddings)

        return unflatten_graph(node_embeddings, coords.shape[0])
|
esm/source/esm/inverse_folding/gvp_modules.py
ADDED
|
@@ -0,0 +1,475 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contents of this file are from the open source code for
|
| 2 |
+
#
|
| 3 |
+
# Jing, B., Eismann, S., Suriana, P., Townshend, R. J. L., & Dror, R. (2020).
|
| 4 |
+
# Learning from Protein Structure with Geometric Vector Perceptrons. In
|
| 5 |
+
# International Conference on Learning Representations.
|
| 6 |
+
#
|
| 7 |
+
# MIT License
|
| 8 |
+
#
|
| 9 |
+
# Copyright (c) 2020 Bowen Jing, Stephan Eismann, Patricia Suriana, Raphael Townshend, Ron Dror
|
| 10 |
+
#
|
| 11 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 12 |
+
# of this software and associated documentation files (the "Software"), to deal
|
| 13 |
+
# in the Software without restriction, including without limitation the rights
|
| 14 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 15 |
+
# copies of the Software, and to permit persons to whom the Software is
|
| 16 |
+
# furnished to do so, subject to the following conditions:
|
| 17 |
+
#
|
| 18 |
+
# The above copyright notice and this permission notice shall be included in all
|
| 19 |
+
# copies or substantial portions of the Software.
|
| 20 |
+
#
|
| 21 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 22 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 23 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 24 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 25 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 26 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 27 |
+
# SOFTWARE.
|
| 28 |
+
|
| 29 |
+
import typing as T
|
| 30 |
+
import torch
|
| 31 |
+
from torch import nn
|
| 32 |
+
import torch.nn.functional as F
|
| 33 |
+
from torch_geometric.nn import MessagePassing
|
| 34 |
+
|
| 35 |
+
def tuple_size(tp):
    '''
    Return a tuple of the `.size()` of each element of `tp`,
    substituting 0 for `None` entries.
    '''
    return tuple(0 if member is None else member.size() for member in tp)
|
| 37 |
+
|
| 38 |
+
def tuple_sum(tp1, tp2):
    '''
    Elementwise sum of two tuples (s, V).

    :param tp1: tuple (s, V); the vector slot V may be `None`
    :param tp2: tuple (s, V); the vector slot V may be `None`
    :return: (s1 + s2, v1 + v2), or (s1 + s2, None) when neither
             input carries vector channels
    '''
    s1, v1 = tp1
    s2, v2 = tp2
    # Bug fix: the original condition was `v2 is None and v2 is None`,
    # which tested v2 twice and never looked at v1 — silently dropping
    # v1 whenever v2 was None. Require both vector slots to be absent
    # before skipping the vector sum.
    if v1 is None and v2 is None:
        return (s1 + s2, None)
    return (s1 + s2, v1 + v2)
|
| 44 |
+
|
| 45 |
+
def tuple_cat(*args, dim=-1):
    '''
    Concatenates any number of tuples (s, V) elementwise.

    :param dim: dimension along which to concatenate when viewed
                as the `dim` index for the scalar-channel tensors.
                This means that `dim=-1` will be applied as
                `dim=-2` for the vector-channel tensors.
    '''
    # Normalize against the scalar tensor's rank: dim=-1 becomes the last
    # scalar axis, which (as a non-negative index) is the channel axis of
    # the vector tensors, since those carry one extra trailing xyz axis.
    dim %= len(args[0][0].shape)
    scalars = [s for s, _ in args]
    vectors = [v for _, v in args]
    return torch.cat(scalars, dim=dim), torch.cat(vectors, dim=dim)
|
| 57 |
+
|
| 58 |
+
def tuple_index(x, idx):
    '''
    Indexes into a tuple (s, V) along the first dimension.

    :param idx: any object which can be used to index into a `torch.Tensor`
    '''
    scalars, vectors = x
    return scalars[idx], vectors[idx]
|
| 65 |
+
|
| 66 |
+
def randn(n, dims, device="cpu"):
    '''
    Returns random tuples (s, V) drawn elementwise from a normal distribution.

    :param n: number of data points
    :param dims: tuple of dimensions (n_scalar, n_vector)

    :return: (s, V) with s.shape = (n, n_scalar) and
             V.shape = (n, n_vector, 3)
    '''
    n_scalar, n_vector = dims
    # Draw the scalar channels first, then the vector channels, preserving
    # the original order of RNG consumption.
    scalars = torch.randn(n, n_scalar, device=device)
    vectors = torch.randn(n, n_vector, 3, device=device)
    return scalars, vectors
|
| 78 |
+
|
| 79 |
+
def _norm_no_nan(x, axis=-1, keepdims=False, eps=1e-8, sqrt=True):
|
| 80 |
+
'''
|
| 81 |
+
L2 norm of tensor clamped above a minimum value `eps`.
|
| 82 |
+
|
| 83 |
+
:param sqrt: if `False`, returns the square of the L2 norm
|
| 84 |
+
'''
|
| 85 |
+
# clamp is slow
|
| 86 |
+
# out = torch.clamp(torch.sum(torch.square(x), axis, keepdims), min=eps)
|
| 87 |
+
out = torch.sum(torch.square(x), axis, keepdims) + eps
|
| 88 |
+
return torch.sqrt(out) if sqrt else out
|
| 89 |
+
|
| 90 |
+
def _split(x, nv):
|
| 91 |
+
'''
|
| 92 |
+
Splits a merged representation of (s, V) back into a tuple.
|
| 93 |
+
Should be used only with `_merge(s, V)` and only if the tuple
|
| 94 |
+
representation cannot be used.
|
| 95 |
+
|
| 96 |
+
:param x: the `torch.Tensor` returned from `_merge`
|
| 97 |
+
:param nv: the number of vector channels in the input to `_merge`
|
| 98 |
+
'''
|
| 99 |
+
v = torch.reshape(x[..., -3*nv:], x.shape[:-1] + (nv, 3))
|
| 100 |
+
s = x[..., :-3*nv]
|
| 101 |
+
return s, v
|
| 102 |
+
|
| 103 |
+
def _merge(s, v):
|
| 104 |
+
'''
|
| 105 |
+
Merges a tuple (s, V) into a single `torch.Tensor`, where the
|
| 106 |
+
vector channels are flattened and appended to the scalar channels.
|
| 107 |
+
Should be used only if the tuple representation cannot be used.
|
| 108 |
+
Use `_split(x, nv)` to reverse.
|
| 109 |
+
'''
|
| 110 |
+
v = torch.reshape(v, v.shape[:-2] + (3*v.shape[-2],))
|
| 111 |
+
return torch.cat([s, v], -1)
|
| 112 |
+
|
| 113 |
+
class GVP(nn.Module):
    '''
    Geometric Vector Perceptron. See manuscript and README.md
    for more details.

    :param in_dims: tuple (n_scalar, n_vector)
    :param out_dims: tuple (n_scalar, n_vector)
    :param h_dim: intermediate number of vector channels, optional
    :param vector_gate: if `True`, gate output vectors with a learned
        linear projection of the output scalars instead of their own norms
    :param activations: tuple of functions (scalar_act, vector_act)
    :param tuple_io: whether to keep accepting tuple inputs and outputs when vi
                     or vo = 0
    :param eps: numerical stabilizer forwarded to `_norm_no_nan`
    '''
    def __init__(self, in_dims, out_dims, h_dim=None, vector_gate=False,
                 activations=(F.relu, torch.sigmoid), tuple_io=True,
                 eps=1e-8):
        super(GVP, self).__init__()
        self.si, self.vi = in_dims
        self.so, self.vo = out_dims
        self.tuple_io = tuple_io
        if self.vi:
            # Vector path: wh mixes input vector channels into h_dim hidden
            # channels; their norms are concatenated with the scalars.
            self.h_dim = h_dim or max(self.vi, self.vo)
            self.wh = nn.Linear(self.vi, self.h_dim, bias=False)
            self.ws = nn.Linear(self.h_dim + self.si, self.so)
            if self.vo:
                # wv maps hidden vector channels to the output vector width.
                self.wv = nn.Linear(self.h_dim, self.vo, bias=False)
                if vector_gate:
                    self.wg = nn.Linear(self.so, self.vo)
        else:
            # No input vectors: plain scalar linear layer.
            self.ws = nn.Linear(self.si, self.so)

        self.vector_gate = vector_gate
        self.scalar_act, self.vector_act = activations
        self.eps = eps

    def forward(self, x):
        '''
        :param x: tuple (s, V) of `torch.Tensor`,
                  or (if vectors_in is 0), a single `torch.Tensor`
        :return: tuple (s, V) of `torch.Tensor`,
                 or (if vectors_out is 0), a single `torch.Tensor`
        '''
        if self.vi:
            s, v = x
            # (..., n_vector, 3) -> (..., 3, n_vector) so nn.Linear mixes
            # vector channels while leaving the xyz axis untouched.
            v = torch.transpose(v, -1, -2)
            vh = self.wh(v)
            # Norms of the hidden vectors are rotation-invariant scalars.
            vn = _norm_no_nan(vh, axis=-2, eps=self.eps)
            s = self.ws(torch.cat([s, vn], -1))
            if self.scalar_act:
                s = self.scalar_act(s)
            if self.vo:
                v = self.wv(vh)
                # Back to (..., n_vector, 3).
                v = torch.transpose(v, -1, -2)
                if self.vector_gate:
                    # Gate each output vector by a scalar-derived weight.
                    g = self.wg(s).unsqueeze(-1)
                else:
                    # Default gating uses each output vector's own norm.
                    g = _norm_no_nan(v, axis=-1, keepdims=True, eps=self.eps)
                if self.vector_act:
                    g = self.vector_act(g)
                v = v * g
        else:
            if self.tuple_io:
                # Tuple input with no vector channels: the vector slot
                # must be explicitly empty.
                assert x[1] is None
                x = x[0]
            s = self.ws(x)
            if self.scalar_act:
                s = self.scalar_act(s)
            if self.vo:
                # No input vectors to transform; emit zero vectors of the
                # requested output width.
                v = torch.zeros(list(s.shape)[:-1] + [self.vo, 3],
                                device=s.device)

        if self.vo:
            return (s, v)
        elif self.tuple_io:
            return (s, None)
        else:
            return s
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
class _VDropout(nn.Module):
|
| 192 |
+
'''
|
| 193 |
+
Vector channel dropout where the elements of each
|
| 194 |
+
vector channel are dropped together.
|
| 195 |
+
'''
|
| 196 |
+
def __init__(self, drop_rate):
|
| 197 |
+
super(_VDropout, self).__init__()
|
| 198 |
+
self.drop_rate = drop_rate
|
| 199 |
+
|
| 200 |
+
def forward(self, x):
|
| 201 |
+
'''
|
| 202 |
+
:param x: `torch.Tensor` corresponding to vector channels
|
| 203 |
+
'''
|
| 204 |
+
if x is None:
|
| 205 |
+
return None
|
| 206 |
+
device = x.device
|
| 207 |
+
if not self.training:
|
| 208 |
+
return x
|
| 209 |
+
mask = torch.bernoulli(
|
| 210 |
+
(1 - self.drop_rate) * torch.ones(x.shape[:-1], device=device)
|
| 211 |
+
).unsqueeze(-1)
|
| 212 |
+
x = mask * x / (1 - self.drop_rate)
|
| 213 |
+
return x
|
| 214 |
+
|
| 215 |
+
class Dropout(nn.Module):
    '''
    Combined dropout for tuples (s, V).
    Takes tuples (s, V) as input and as output.
    '''
    def __init__(self, drop_rate):
        super(Dropout, self).__init__()
        # Standard dropout for scalar channels; channel-wise dropout
        # for vector channels.
        self.sdropout = nn.Dropout(drop_rate)
        self.vdropout = _VDropout(drop_rate)

    def forward(self, x):
        '''
        :param x: tuple (s, V) of `torch.Tensor`,
                  or single `torch.Tensor`
                  (will be assumed to be scalar channels)
        '''
        # Exact type check (not isinstance) mirrors the original contract:
        # only a bare torch.Tensor is treated as scalar-only input.
        if type(x) is torch.Tensor:
            return self.sdropout(x)
        scalars, vectors = x
        return self.sdropout(scalars), self.vdropout(vectors)
|
| 235 |
+
|
| 236 |
+
class LayerNorm(nn.Module):
    '''
    Combined LayerNorm for tuples (s, V).
    Takes tuples (s, V) as input and as output.

    :param dims: tuple (n_scalar, n_vector)
    :param tuple_io: if `True`, accept/return (s, None) tuples even when
        there are no vector channels
    :param eps: numerical stabilizer for the vector-norm computation
    '''
    def __init__(self, dims, tuple_io=True, eps=1e-8):
        super(LayerNorm, self).__init__()
        self.tuple_io = tuple_io
        self.s, self.v = dims
        # Standard LayerNorm is applied only to the scalar channels.
        self.scalar_norm = nn.LayerNorm(self.s)
        self.eps = eps

    def forward(self, x):
        '''
        :param x: tuple (s, V) of `torch.Tensor`,
                  or single `torch.Tensor`
                  (will be assumed to be scalar channels)
        '''
        if not self.v:
            # No vector channels configured: normalize scalars only.
            if self.tuple_io:
                return self.scalar_norm(x[0]), None
            return self.scalar_norm(x)
        s, v = x
        # Squared norms of each vector channel (sqrt deferred until after
        # averaging).
        vn = _norm_no_nan(v, axis=-1, keepdims=True, sqrt=False, eps=self.eps)
        # Channels whose squared norm is ~0 (all-zero vectors) are excluded
        # from the normalization statistics and zeroed in the output.
        nonzero_mask = (vn > 2 * self.eps)
        # Mean squared norm over the non-zero vector channels.
        vn = torch.sum(vn * nonzero_mask, dim=-2, keepdim=True
            ) / (self.eps + torch.sum(nonzero_mask, dim=-2, keepdim=True))
        vn = torch.sqrt(vn + self.eps)
        # Scale vectors by the RMS norm; zero channels stay exactly zero.
        v = nonzero_mask * (v / vn)
        return self.scalar_norm(s), v
|
| 266 |
+
|
| 267 |
+
class GVPConv(MessagePassing):
    '''
    Graph convolution / message passing with Geometric Vector Perceptrons.
    Takes in a graph with node and edge embeddings,
    and returns new node embeddings.

    This does NOT do residual updates and pointwise feedforward layers
    ---see `GVPConvLayer`.

    :param in_dims: input node embedding dimensions (n_scalar, n_vector)
    :param out_dims: output node embedding dimensions (n_scalar, n_vector)
    :param edge_dims: input edge embedding dimensions (n_scalar, n_vector)
    :param n_layers: number of GVPs in the message function
    :param module_list: preconstructed message function, overrides n_layers
    :param aggr: should be "add" if some incoming edges are masked, as in
                 a masked autoregressive decoder architecture
    '''
    def __init__(self, in_dims, out_dims, edge_dims, n_layers=3,
                 vector_gate=False, module_list=None, aggr="mean", eps=1e-8,
                 activations=(F.relu, torch.sigmoid)):
        super(GVPConv, self).__init__(aggr=aggr)
        self.eps = eps
        self.si, self.vi = in_dims
        self.so, self.vo = out_dims
        self.se, self.ve = edge_dims

        module_list = module_list or []
        if not module_list:
            # Message input is the concatenation of (dst node, edge, src
            # node) features, hence the 2*node + edge input dims below.
            if n_layers == 1:
                # Single linear GVP straight to the output dims.
                module_list.append(
                    GVP((2*self.si + self.se, 2*self.vi + self.ve),
                        (self.so, self.vo), activations=(None, None)))
            else:
                module_list.append(
                    GVP((2*self.si + self.se, 2*self.vi + self.ve), out_dims,
                        vector_gate=vector_gate, activations=activations)
                )
                for i in range(n_layers - 2):
                    module_list.append(GVP(out_dims, out_dims,
                                           vector_gate=vector_gate))
                # Final layer has no activation.
                module_list.append(GVP(out_dims, out_dims,
                                       activations=(None, None)))
        self.message_func = nn.Sequential(*module_list)

    def forward(self, x, edge_index, edge_attr):
        '''
        :param x: tuple (s, V) of `torch.Tensor`
        :param edge_index: array of shape [2, n_edges]
        :param edge_attr: tuple (s, V) of `torch.Tensor`
        '''
        x_s, x_v = x
        # propagate() handles flat per-node tensors, so the (n, nv, 3)
        # vectors are flattened to (n, 3*nv) here and re-split afterwards.
        message = self.propagate(edge_index,
                    s=x_s, v=x_v.reshape(x_v.shape[0], 3*x_v.shape[1]),
                    edge_attr=edge_attr)
        return _split(message, self.vo)

    def message(self, s_i, s_j, v_j, v_i, edge_attr):
        # Restore the (n_edges, nv, 3) vector shape before applying GVPs.
        v_j = v_j.view(v_j.shape[0], v_j.shape[1]//3, 3)
        v_i = v_i.view(v_i.shape[0], v_i.shape[1]//3, 3)
        message = tuple_cat((s_j, v_j), edge_attr, (s_i, v_i))
        message = self.message_func(message)
        # Re-merge into a flat tensor for aggregation by MessagePassing.
        return _merge(*message)
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
class GVPConvLayer(nn.Module):
    '''
    Full graph convolution / message passing layer with
    Geometric Vector Perceptrons. Residually updates node embeddings with
    aggregated incoming messages, applies a pointwise feedforward
    network to node embeddings, and returns updated node embeddings.

    To only compute the aggregated messages, see `GVPConv`.

    :param node_dims: node embedding dimensions (n_scalar, n_vector)
    :param edge_dims: input edge embedding dimensions (n_scalar, n_vector)
    :param n_message: number of GVPs to use in message function
    :param n_feedforward: number of GVPs to use in feedforward function
    :param drop_rate: drop probability in all dropout layers
    :param autoregressive: if `True`, this `GVPConvLayer` will be used
           with a different set of input node embeddings for messages
           where src >= dst
    '''
    def __init__(self, node_dims, edge_dims, vector_gate=False,
                 n_message=3, n_feedforward=2, drop_rate=.1,
                 autoregressive=False, attention_heads=0,
                 conv_activations=(F.relu, torch.sigmoid),
                 n_edge_gvps=0, layernorm=True, eps=1e-8):

        super(GVPConvLayer, self).__init__()
        if attention_heads == 0:
            # "add" aggregation is required in the autoregressive case
            # because messages are later renormalized by in-degree.
            self.conv = GVPConv(
                node_dims, node_dims, edge_dims, n_layers=n_message,
                vector_gate=vector_gate,
                aggr="add" if autoregressive else "mean",
                activations=conv_activations,
                eps=eps,
            )
        else:
            # Attention-based message passing is not implemented here.
            raise NotImplementedError
        if layernorm:
            self.norm = nn.ModuleList([LayerNorm(node_dims, eps=eps) for _ in range(2)])
        else:
            self.norm = nn.ModuleList([nn.Identity() for _ in range(2)])
        self.dropout = nn.ModuleList([Dropout(drop_rate) for _ in range(2)])

        # Pointwise feedforward stack; final layer is linear (no activation).
        ff_func = []
        if n_feedforward == 1:
            ff_func.append(GVP(node_dims, node_dims, activations=(None, None)))
        else:
            hid_dims = 4*node_dims[0], 2*node_dims[1]
            ff_func.append(GVP(node_dims, hid_dims, vector_gate=vector_gate))
            for i in range(n_feedforward-2):
                ff_func.append(GVP(hid_dims, hid_dims, vector_gate=vector_gate))
            ff_func.append(GVP(hid_dims, node_dims, activations=(None, None)))
        self.ff_func = nn.Sequential(*ff_func)

        # Optional edge-update stack: edges are residually refined from
        # (src node, edge, dst node) features before node message passing.
        self.edge_message_func = None
        if n_edge_gvps > 0:
            si, vi = node_dims
            se, ve = edge_dims
            module_list = [
                GVP((2*si + se, 2*vi + ve), edge_dims, vector_gate=vector_gate)
            ]
            for i in range(n_edge_gvps - 2):
                module_list.append(GVP(edge_dims, edge_dims,
                                       vector_gate=vector_gate))
            if n_edge_gvps > 1:
                module_list.append(GVP(edge_dims, edge_dims,
                                       activations=(None, None)))
            self.edge_message_func = nn.Sequential(*module_list)
            if layernorm:
                self.edge_norm = LayerNorm(edge_dims, eps=eps)
            else:
                self.edge_norm = nn.Identity()
            self.edge_dropout = Dropout(drop_rate)

    def forward(self, x, edge_index, edge_attr,
                autoregressive_x=None, node_mask=None):
        '''
        :param x: tuple (s, V) of `torch.Tensor`
        :param edge_index: array of shape [2, n_edges]
        :param edge_attr: tuple (s, V) of `torch.Tensor`
        :param autoregressive_x: tuple (s, V) of `torch.Tensor`.
                If not `None`, will be used as src node embeddings
                for forming messages where src >= dst. The current node
                embeddings `x` will still be the base of the update and the
                pointwise feedforward.
        :param node_mask: array of type `bool` to index into the first
                dim of node embeddings (s, V). If not `None`, only
                these nodes will be updated.
        '''
        if self.edge_message_func:
            src, dst = edge_index
            if autoregressive_x is None:
                x_src = x[0][src], x[1][src]
            else:
                # For src >= dst edges, draw source features from the
                # autoregressive embeddings instead of the current ones.
                mask = (src < dst).unsqueeze(-1)
                x_src = (
                    torch.where(mask, x[0][src], autoregressive_x[0][src]),
                    torch.where(mask.unsqueeze(-1), x[1][src],
                        autoregressive_x[1][src])
                )
            x_dst = x[0][dst], x[1][dst]
            x_edge = (
                torch.cat([x_src[0], edge_attr[0], x_dst[0]], dim=-1),
                torch.cat([x_src[1], edge_attr[1], x_dst[1]], dim=-2)
            )
            edge_attr_dh = self.edge_message_func(x_edge)
            # Residual update of the edge features.
            edge_attr = self.edge_norm(tuple_sum(edge_attr,
                self.edge_dropout(edge_attr_dh)))

        if autoregressive_x is not None:
            # Guarding this import here to remove the dependency on torch_scatter, since this isn't used
            # in ESM-IF1
            from torch_scatter import scatter_add
            src, dst = edge_index
            mask = src < dst
            # Forward edges (src < dst) use current embeddings; backward
            # edges use the autoregressive embeddings.
            edge_index_forward = edge_index[:, mask]
            edge_index_backward = edge_index[:, ~mask]
            edge_attr_forward = tuple_index(edge_attr, mask)
            edge_attr_backward = tuple_index(edge_attr, ~mask)

            dh = tuple_sum(
                self.conv(x, edge_index_forward, edge_attr_forward),
                self.conv(autoregressive_x, edge_index_backward, edge_attr_backward)
            )

            # Convert the "add"-aggregated messages to a mean by dividing
            # by each destination node's in-degree (clamped to avoid /0).
            count = scatter_add(torch.ones_like(dst), dst,
                        dim_size=dh[0].size(0)).clamp(min=1).unsqueeze(-1)

            dh = dh[0] / count, dh[1] / count.unsqueeze(-1)

        else:
            dh = self.conv(x, edge_index, edge_attr)

        if node_mask is not None:
            # Restrict the residual update to the masked subset of nodes.
            x_ = x
            x, dh = tuple_index(x, node_mask), tuple_index(dh, node_mask)

        x = self.norm[0](tuple_sum(x, self.dropout[0](dh)))

        dh = self.ff_func(x)
        x = self.norm[1](tuple_sum(x, self.dropout[1](dh)))

        if node_mask is not None:
            # Scatter the updated subset back into the full embeddings.
            x_[0][node_mask], x_[1][node_mask] = x[0], x[1]
            x = x_

        return x, edge_attr
|
esm/source/esm/inverse_folding/gvp_transformer.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
from typing import Any, Dict, List, Optional, Tuple, NamedTuple
|
| 8 |
+
import torch
|
| 9 |
+
from torch import nn
|
| 10 |
+
from torch import Tensor
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
from scipy.spatial import transform
|
| 13 |
+
|
| 14 |
+
from esm.data import Alphabet
|
| 15 |
+
|
| 16 |
+
from .features import DihedralFeatures
|
| 17 |
+
from .gvp_encoder import GVPEncoder
|
| 18 |
+
from .gvp_utils import unflatten_graph
|
| 19 |
+
from .gvp_transformer_encoder import GVPTransformerEncoder
|
| 20 |
+
from .transformer_decoder import TransformerDecoder
|
| 21 |
+
from .util import rotate, CoordBatchConverter
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class GVPTransformerModel(nn.Module):
    """
    GVP-Transformer inverse folding model.

    Architecture: Geometric GVP-GNN as initial layers, followed by
    sequence-to-sequence Transformer encoder and decoder.
    """

    def __init__(self, args, alphabet):
        super().__init__()
        # Separate token embeddings for encoder and decoder (their
        # embedding dims may differ per args).
        encoder_embed_tokens = self.build_embedding(
            args, alphabet, args.encoder_embed_dim,
        )
        decoder_embed_tokens = self.build_embedding(
            args, alphabet, args.decoder_embed_dim,
        )
        encoder = self.build_encoder(args, alphabet, encoder_embed_tokens)
        decoder = self.build_decoder(args, alphabet, decoder_embed_tokens)
        self.args = args
        self.encoder = encoder
        self.decoder = decoder

    @classmethod
    def build_encoder(cls, args, src_dict, embed_tokens):
        # Structure encoder: GVP-GNN features feeding a Transformer encoder.
        encoder = GVPTransformerEncoder(args, src_dict, embed_tokens)
        return encoder

    @classmethod
    def build_decoder(cls, args, tgt_dict, embed_tokens):
        # Autoregressive sequence decoder.
        decoder = TransformerDecoder(
            args,
            tgt_dict,
            embed_tokens,
        )
        return decoder

    @classmethod
    def build_embedding(cls, args, dictionary, embed_dim):
        """Build a token embedding scaled-init'd with a zeroed padding row."""
        num_embeddings = len(dictionary)
        padding_idx = dictionary.padding_idx
        emb = nn.Embedding(num_embeddings, embed_dim, padding_idx)
        nn.init.normal_(emb.weight, mean=0, std=embed_dim ** -0.5)
        nn.init.constant_(emb.weight[padding_idx], 0)
        return emb

    def forward(
        self,
        coords,
        padding_mask,
        confidence,
        prev_output_tokens,
        return_all_hiddens: bool = False,
        features_only: bool = False,
    ):
        """
        Encode backbone coordinates, then decode logits for
        `prev_output_tokens` (teacher forcing).
        """
        encoder_out = self.encoder(coords, padding_mask, confidence,
            return_all_hiddens=return_all_hiddens)
        logits, extra = self.decoder(
            prev_output_tokens,
            encoder_out=encoder_out,
            features_only=features_only,
            return_all_hiddens=return_all_hiddens,
        )
        return logits, extra

    def sample(self, coords, partial_seq=None, temperature=1.0, confidence=None, device=None):
        """
        Samples sequences based on multinomial sampling (no beam search).

        Args:
            coords: L x 3 x 3 list representing one backbone
            partial_seq: Optional, partial sequence with mask tokens if part of
                the sequence is known
            temperature: sampling temperature, use low temperature for higher
                sequence recovery and high temperature for higher diversity
            confidence: optional length L list of confidence scores for coordinates
            device: optional torch device to run sampling on
        """
        L = len(coords)
        # Convert to batch format
        batch_converter = CoordBatchConverter(self.decoder.dictionary)
        batch_coords, confidence, _, _, padding_mask = (
            batch_converter([(coords, confidence, None)], device=device)
        )

        # Start with prepend token
        mask_idx = self.decoder.dictionary.get_idx('<mask>')
        sampled_tokens = torch.full((1, 1+L), mask_idx, dtype=int)
        sampled_tokens[0, 0] = self.decoder.dictionary.get_idx('<cath>')
        if partial_seq is not None:
            # Pre-fill any known residues; these positions are kept fixed
            # during sampling (only <mask> positions are drawn below).
            for i, c in enumerate(partial_seq):
                sampled_tokens[0, i+1] = self.decoder.dictionary.get_idx(c)

        # Save incremental states for faster sampling
        incremental_state = dict()

        # Run encoder only once
        encoder_out = self.encoder(batch_coords, padding_mask, confidence)

        # Make sure all tensors are on the same device if a GPU is present
        if device:
            sampled_tokens = sampled_tokens.to(device)

        # Decode one token at a time
        for i in range(1, L+1):
            logits, _ = self.decoder(
                sampled_tokens[:, :i],
                encoder_out,
                incremental_state=incremental_state,
            )
            logits = logits[0].transpose(0, 1)
            # Temperature-scaled multinomial sampling.
            logits /= temperature
            probs = F.softmax(logits, dim=-1)
            if sampled_tokens[0, i] == mask_idx:
                sampled_tokens[:, i] = torch.multinomial(probs, 1).squeeze(-1)
        # Drop the prepend token before converting back to characters.
        sampled_seq = sampled_tokens[0, 1:]

        # Convert back to string via lookup
        return ''.join([self.decoder.dictionary.get_tok(a) for a in sampled_seq])
|
esm/source/esm/inverse_folding/gvp_transformer_encoder.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# Contents of this file were adapted from the open source fairseq repository.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the MIT license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
import argparse
|
| 9 |
+
import math
|
| 10 |
+
from typing import Dict, List, Optional
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torch.nn as nn
|
| 14 |
+
from torch import Tensor
|
| 15 |
+
|
| 16 |
+
from esm.modules import SinusoidalPositionalEmbedding
|
| 17 |
+
from .features import GVPInputFeaturizer, DihedralFeatures
|
| 18 |
+
from .gvp_encoder import GVPEncoder
|
| 19 |
+
from .transformer_layer import TransformerEncoderLayer
|
| 20 |
+
from .util import nan_to_num, get_rotation_frames, rotate, rbf
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class GVPTransformerEncoder(nn.Module):
|
| 24 |
+
"""
|
| 25 |
+
Transformer encoder consisting of *args.encoder.layers* layers. Each layer
|
| 26 |
+
is a :class:`TransformerEncoderLayer`.
|
| 27 |
+
|
| 28 |
+
Args:
|
| 29 |
+
args (argparse.Namespace): parsed command-line arguments
|
| 30 |
+
dictionary (~fairseq.data.Dictionary): encoding dictionary
|
| 31 |
+
embed_tokens (torch.nn.Embedding): input embedding
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
def __init__(self, args, dictionary, embed_tokens):
|
| 35 |
+
super().__init__()
|
| 36 |
+
self.args = args
|
| 37 |
+
self.dictionary = dictionary
|
| 38 |
+
|
| 39 |
+
self.dropout_module = nn.Dropout(args.dropout)
|
| 40 |
+
|
| 41 |
+
embed_dim = embed_tokens.embedding_dim
|
| 42 |
+
self.padding_idx = embed_tokens.padding_idx
|
| 43 |
+
|
| 44 |
+
self.embed_tokens = embed_tokens
|
| 45 |
+
self.embed_scale = math.sqrt(embed_dim)
|
| 46 |
+
self.embed_positions = SinusoidalPositionalEmbedding(
|
| 47 |
+
embed_dim,
|
| 48 |
+
self.padding_idx,
|
| 49 |
+
)
|
| 50 |
+
self.embed_gvp_input_features = nn.Linear(15, embed_dim)
|
| 51 |
+
self.embed_confidence = nn.Linear(16, embed_dim)
|
| 52 |
+
self.embed_dihedrals = DihedralFeatures(embed_dim)
|
| 53 |
+
|
| 54 |
+
gvp_args = argparse.Namespace()
|
| 55 |
+
for k, v in vars(args).items():
|
| 56 |
+
if k.startswith("gvp_"):
|
| 57 |
+
setattr(gvp_args, k[4:], v)
|
| 58 |
+
self.gvp_encoder = GVPEncoder(gvp_args)
|
| 59 |
+
gvp_out_dim = gvp_args.node_hidden_dim_scalar + (3 *
|
| 60 |
+
gvp_args.node_hidden_dim_vector)
|
| 61 |
+
self.embed_gvp_output = nn.Linear(gvp_out_dim, embed_dim)
|
| 62 |
+
|
| 63 |
+
self.layers = nn.ModuleList([])
|
| 64 |
+
self.layers.extend(
|
| 65 |
+
[self.build_encoder_layer(args) for i in range(args.encoder_layers)]
|
| 66 |
+
)
|
| 67 |
+
self.num_layers = len(self.layers)
|
| 68 |
+
self.layer_norm = nn.LayerNorm(embed_dim)
|
| 69 |
+
|
| 70 |
+
def build_encoder_layer(self, args):
|
| 71 |
+
return TransformerEncoderLayer(args)
|
| 72 |
+
|
| 73 |
+
def forward_embedding(self, coords, padding_mask, confidence):
|
| 74 |
+
"""
|
| 75 |
+
Args:
|
| 76 |
+
coords: N, CA, C backbone coordinates in shape length x 3 (atoms) x 3
|
| 77 |
+
padding_mask: boolean Tensor (true for padding) of shape length
|
| 78 |
+
confidence: confidence scores between 0 and 1 of shape length
|
| 79 |
+
"""
|
| 80 |
+
components = dict()
|
| 81 |
+
coord_mask = torch.all(torch.all(torch.isfinite(coords), dim=-1), dim=-1)
|
| 82 |
+
coords = nan_to_num(coords)
|
| 83 |
+
mask_tokens = (
|
| 84 |
+
padding_mask * self.dictionary.padding_idx +
|
| 85 |
+
~padding_mask * self.dictionary.get_idx("<mask>")
|
| 86 |
+
)
|
| 87 |
+
components["tokens"] = self.embed_tokens(mask_tokens) * self.embed_scale
|
| 88 |
+
components["diherals"] = self.embed_dihedrals(coords)
|
| 89 |
+
|
| 90 |
+
# GVP encoder
|
| 91 |
+
gvp_out_scalars, gvp_out_vectors = self.gvp_encoder(coords,
|
| 92 |
+
coord_mask, padding_mask, confidence)
|
| 93 |
+
R = get_rotation_frames(coords)
|
| 94 |
+
# Rotate to local rotation frame for rotation-invariance
|
| 95 |
+
gvp_out_features = torch.cat([
|
| 96 |
+
gvp_out_scalars,
|
| 97 |
+
rotate(gvp_out_vectors, R.transpose(-2, -1)).flatten(-2, -1),
|
| 98 |
+
], dim=-1)
|
| 99 |
+
components["gvp_out"] = self.embed_gvp_output(gvp_out_features)
|
| 100 |
+
|
| 101 |
+
components["confidence"] = self.embed_confidence(
|
| 102 |
+
rbf(confidence, 0., 1.))
|
| 103 |
+
|
| 104 |
+
# In addition to GVP encoder outputs, also directly embed GVP input node
|
| 105 |
+
# features to the Transformer
|
| 106 |
+
scalar_features, vector_features = GVPInputFeaturizer.get_node_features(
|
| 107 |
+
coords, coord_mask, with_coord_mask=False)
|
| 108 |
+
features = torch.cat([
|
| 109 |
+
scalar_features,
|
| 110 |
+
rotate(vector_features, R.transpose(-2, -1)).flatten(-2, -1),
|
| 111 |
+
], dim=-1)
|
| 112 |
+
components["gvp_input_features"] = self.embed_gvp_input_features(features)
|
| 113 |
+
|
| 114 |
+
embed = sum(components.values())
|
| 115 |
+
# for k, v in components.items():
|
| 116 |
+
# print(k, torch.mean(v, dim=(0,1)), torch.std(v, dim=(0,1)))
|
| 117 |
+
|
| 118 |
+
x = embed
|
| 119 |
+
x = x + self.embed_positions(mask_tokens)
|
| 120 |
+
x = self.dropout_module(x)
|
| 121 |
+
return x, components
|
| 122 |
+
|
| 123 |
+
def forward(
    self,
    coords,
    encoder_padding_mask,
    confidence,
    return_all_hiddens: bool = False,
):
    """Encode backbone coordinates into per-residue representations.

    Args:
        coords (Tensor): backbone coordinates, shape
            batch_size x num_residues x num_atoms (3 for N, CA, C) x 3
        encoder_padding_mask (ByteTensor): positions of padding elements,
            shape `(batch_size x num_residues)`
        confidence (Tensor): per-residue confidence in [0., 1.], or -1.
            when no coordinate is given; shape (batch_size x num_residues)
        return_all_hiddens (bool, optional): also collect every
            intermediate hidden state (default: False).

    Returns:
        dict:
            - **encoder_out** (Tensor): last layer output,
              shape `(num_residues, batch_size, embed_dim)`
            - **encoder_padding_mask** (ByteTensor): padding positions,
              shape `(batch_size, num_residues)`
            - **encoder_embedding**: dictionary of embedding components
              from forward_embedding
            - **encoder_states** (List[Tensor]): intermediate hidden
              states, `(num_residues, batch_size, embed_dim)` each; only
              populated when *return_all_hiddens* is True.
    """
    hidden, embedding_components = self.forward_embedding(
        coords, encoder_padding_mask, confidence)

    # Zero out padded positions so padding cannot leak into the encoder.
    keep = 1 - encoder_padding_mask.unsqueeze(-1).type_as(hidden)
    hidden = hidden * keep

    # B x T x C -> T x B x C (the layers expect time-major input)
    hidden = hidden.transpose(0, 1)

    all_states = []
    if return_all_hiddens:
        all_states.append(hidden)

    # encoder layers
    for layer in self.layers:
        hidden = layer(hidden, encoder_padding_mask=encoder_padding_mask)
        if return_all_hiddens:
            assert all_states is not None
            all_states.append(hidden)

    if self.layer_norm is not None:
        hidden = self.layer_norm(hidden)

    return {
        "encoder_out": [hidden],  # T x B x C
        "encoder_padding_mask": [encoder_padding_mask],  # B x T
        "encoder_embedding": [embedding_components],  # dictionary
        "encoder_states": all_states,  # List[T x B x C]
    }
|
esm/source/esm/inverse_folding/gvp_utils.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def flatten_graph(node_embeddings, edge_embeddings, edge_index):
    """
    Collapse a batched graph into a single batch-size-one graph made of
    disconnected subgraphs, as expected by the pytorch-geometric package.
    Args:
        node_embeddings: node embeddings in tuple form (scalar, vector)
                - scalar: shape batch size x nodes x node_embed_dim
                - vector: shape batch size x nodes x node_embed_dim x 3
        edge_embeddings: edge embeddings in tuple form (scalar, vector)
                - scalar: shape batch size x edges x edge_embed_dim
                - vector: shape batch size x edges x edge_embed_dim x 3
        edge_index: shape batch_size x 2 (source node and target node) x edges
    Returns:
        node_embeddings: node embeddings in tuple form (scalar, vector)
                - scalar: shape total_nodes x node_embed_dim
                - vector: shape total_nodes x node_embed_dim x 3
        edge_embeddings: edge embeddings in tuple form (scalar, vector)
                - scalar: shape total_edges x edge_embed_dim
                - vector: shape total_edges x edge_embed_dim x 3
        edge_index: shape 2 x total_edges
    """
    scalar_nodes, vector_nodes = node_embeddings
    scalar_edges, vector_edges = edge_embeddings
    batch_size, nodes_per_graph = scalar_nodes.shape[0], scalar_nodes.shape[1]

    flat_nodes = (
        torch.flatten(scalar_nodes, 0, 1),
        torch.flatten(vector_nodes, 0, 1),
    )
    flat_edges = (
        torch.flatten(scalar_edges, 0, 1),
        torch.flatten(vector_edges, 0, 1),
    )

    # An edge whose (source, target) pair is entirely -1 is padding;
    # the mask is computed BEFORE renumbering so -1 entries stay detectable.
    valid = torch.any(edge_index != -1, dim=1)
    # Re-number nodes by adding batch_idx * nodes_per_graph to each example
    # so the concatenated subgraphs remain disjoint.
    offsets = (torch.arange(batch_size, device=edge_index.device)
               * nodes_per_graph).unsqueeze(-1).unsqueeze(-1)
    edge_index = (edge_index + offsets).permute(1, 0, 2).flatten(1, 2)
    valid = valid.flatten()
    edge_index = edge_index[:, valid]
    flat_edges = (
        flat_edges[0][valid, :],
        flat_edges[1][valid, :],
    )
    return flat_nodes, flat_edges, edge_index
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def unflatten_graph(node_embeddings, batch_size):
    """
    Restore the batch dimension of flattened node embeddings.
    Args:
        node_embeddings: node embeddings in tuple form (scalar, vector)
                - scalar: shape total_nodes x node_embed_dim
                - vector: shape total_nodes x node_embed_dim x 3
        batch_size: int
    Returns:
        node_embeddings: node embeddings in tuple form (scalar, vector)
                - scalar: shape batch size x nodes x node_embed_dim
                - vector: shape batch size x nodes x node_embed_dim x 3
    """
    scalars, vectors = node_embeddings
    # Infer nodes-per-example with -1; total_nodes must divide evenly.
    scalars = scalars.reshape(batch_size, -1, scalars.shape[1])
    vectors = vectors.reshape(batch_size, -1, vectors.shape[1], vectors.shape[2])
    return (scalars, vectors)
|
| 67 |
+
|
| 68 |
+
|
esm/source/esm/inverse_folding/multichain_util.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import biotite.structure
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
from typing import Sequence, Tuple, List
|
| 10 |
+
|
| 11 |
+
from esm.inverse_folding.util import (
|
| 12 |
+
load_structure,
|
| 13 |
+
extract_coords_from_structure,
|
| 14 |
+
load_coords,
|
| 15 |
+
get_sequence_loss,
|
| 16 |
+
get_encoder_output,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def extract_coords_from_complex(structure: biotite.structure.AtomArray):
    """
    Extract backbone coordinates and sequences for every chain in a complex.
    Args:
        structure: biotite AtomArray
    Returns:
        Tuple (coords, seqs)
        - coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
          coordinates representing the backbone of each chain
        - seqs: Dictionary mapping chain ids to native sequences of each chain
    """
    coords, seqs = {}, {}
    for chain_id in biotite.structure.get_chains(structure):
        chain_atoms = structure[structure.chain_id == chain_id]
        coords[chain_id], seqs[chain_id] = extract_coords_from_structure(chain_atoms)
    return coords, seqs
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def load_complex_coords(fpath, chains):
    """
    Load a multi-chain structure file and extract per-chain backbones.
    Args:
        fpath: filepath to either pdb or cif file
        chains: the chain ids (the order matters for autoregressive model)
    Returns:
        Tuple (coords, seqs)
        - coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
          coordinates representing the backbone of each chain
        - seqs: Dictionary mapping chain ids to native sequences of each chain
    """
    return extract_coords_from_complex(load_structure(fpath, chains))
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _concatenate_coords(coords, target_chain_id, padding_length=10):
|
| 55 |
+
"""
|
| 56 |
+
Args:
|
| 57 |
+
coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
|
| 58 |
+
coordinates representing the backbone of each chain
|
| 59 |
+
target_chain_id: The chain id to sample sequences for
|
| 60 |
+
padding_length: Length of padding between concatenated chains
|
| 61 |
+
Returns:
|
| 62 |
+
Tuple (coords, seq)
|
| 63 |
+
- coords is an L x 3 x 3 array for N, CA, C coordinates, a
|
| 64 |
+
concatenation of the chains with padding in between
|
| 65 |
+
- seq is the extracted sequence, with padding tokens inserted
|
| 66 |
+
between the concatenated chains
|
| 67 |
+
"""
|
| 68 |
+
pad_coords = np.full((padding_length, 3, 3), np.nan, dtype=np.float32)
|
| 69 |
+
# For best performance, put the target chain first in concatenation.
|
| 70 |
+
coords_list = [coords[target_chain_id]]
|
| 71 |
+
for chain_id in coords:
|
| 72 |
+
if chain_id == target_chain_id:
|
| 73 |
+
continue
|
| 74 |
+
coords_list.append(pad_coords)
|
| 75 |
+
coords_list.append(coords[chain_id])
|
| 76 |
+
coords_concatenated = np.concatenate(coords_list, axis=0)
|
| 77 |
+
return coords_concatenated
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def sample_sequence_in_complex(model, coords, target_chain_id, temperature=1.,
        padding_length=10):
    """
    Samples sequence for one chain in a complex.
    Args:
        model: An instance of the GVPTransformer model
        coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
            coordinates representing the backbone of each chain
        target_chain_id: The chain id to sample sequences for
        temperature: sampling temperature
        padding_length: padding length in between chains
    Returns:
        Sampled sequence for the target chain
    """
    target_chain_len = coords[target_chain_id].shape[0]
    all_coords = _concatenate_coords(coords, target_chain_id)
    device = next(model.parameters()).device

    # The target chain occupies the first positions of the concatenation:
    # mask those for sampling, and mark everything else as padding so the
    # model does not waste time sampling the other chains.
    padding_pattern = ['<mask>'] * target_chain_len
    padding_pattern += ['<pad>'] * (all_coords.shape[0] - target_chain_len)
    sampled = model.sample(all_coords, partial_seq=padding_pattern,
                           temperature=temperature, device=device)
    return sampled[:target_chain_len]
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def score_sequence_in_complex(model, alphabet, coords, target_chain_id,
        target_seq, padding_length=10):
    """
    Scores sequence for one chain in a complex.
    Args:
        model: An instance of the GVPTransformer model
        alphabet: Alphabet for the model
        coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
            coordinates representing the backbone of each chain
        target_chain_id: The chain id to sample sequences for
        target_seq: Target sequence for the target chain for scoring.
        padding_length: padding length in between chains
    Returns:
        Tuple (ll_fullseq, ll_withcoord)
        - ll_fullseq: Average log-likelihood over the full target chain
        - ll_withcoord: Average log-likelihood in target chain excluding those
          residues without coordinates
    """
    all_coords = _concatenate_coords(coords, target_chain_id)

    loss, target_padding_mask = get_sequence_loss(model, alphabet, all_coords,
            target_seq)
    # Average over non-padding positions of the target chain.
    valid = ~target_padding_mask
    ll_fullseq = -np.sum(loss * valid) / np.sum(valid)

    # Also average while excluding residues with missing coordinates.
    coord_mask = np.all(np.isfinite(coords[target_chain_id]), axis=(-1, -2))
    ll_withcoord = -np.sum(loss * coord_mask) / np.sum(coord_mask)
    return ll_fullseq, ll_withcoord
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def get_encoder_output_for_complex(model, alphabet, coords, target_chain_id):
    """
    Extract encoder representations for the target chain of a complex.
    Args:
        model: An instance of the GVPTransformer model
        alphabet: Alphabet for the model
        coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
            coordinates representing the backbone of each chain
        target_chain_id: The chain id to sample sequences for
    Returns:
        Encoder output for the target chain
    """
    all_coords = _concatenate_coords(coords, target_chain_id)
    encoder_rep = get_encoder_output(model, alphabet, all_coords)
    # The target chain was placed first in the concatenation, so its
    # representation is the leading slice of the encoder output.
    target_chain_len = coords[target_chain_id].shape[0]
    return encoder_rep[:target_chain_len]
|
esm/source/esm/inverse_folding/transformer_decoder.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# Contents of this file were adapted from the open source fairseq repository.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the MIT license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
import math
|
| 9 |
+
from typing import Any, Dict, List, Optional
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
import torch.nn as nn
|
| 13 |
+
from torch import Tensor
|
| 14 |
+
|
| 15 |
+
from esm.modules import SinusoidalPositionalEmbedding
|
| 16 |
+
from .transformer_layer import TransformerDecoderLayer
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def fill_with_neg_inf(t):
    """FP16-compatible function that fills a tensor with -inf.

    The fill is performed in float32 and the result cast back to the
    input dtype, avoiding fp16 fill issues.
    """
    neg_inf = float("-inf")
    return t.float().fill_(neg_inf).type_as(t)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class TransformerDecoder(nn.Module):
|
| 25 |
+
"""
|
| 26 |
+
Transformer decoder consisting of *args.decoder.layers* layers. Each layer
|
| 27 |
+
is a :class:`TransformerDecoderLayer`.
|
| 28 |
+
|
| 29 |
+
Args:
|
| 30 |
+
args (argparse.Namespace): parsed command-line arguments
|
| 31 |
+
dictionary (~fairseq.data.Dictionary): decoding dictionary
|
| 32 |
+
embed_tokens (torch.nn.Embedding): output embedding
|
| 33 |
+
no_encoder_attn (bool, optional): whether to attend to encoder outputs
|
| 34 |
+
(default: False).
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
def __init__(
    self,
    args,
    dictionary,
    embed_tokens,
):
    """Build the decoder: embeddings, positional encoding, layer stack,
    final layer norm, and the output projection.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
    """
    super().__init__()
    self.args = args
    self.dictionary = dictionary
    # Causal-mask cache; lazily (re)built by buffered_future_mask().
    self._future_mask = torch.empty(0)

    self.dropout_module = nn.Dropout(args.dropout)

    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    input_embed_dim = embed_tokens.embedding_dim

    self.padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)

    # Only project the token embeddings when their width differs from the
    # decoder's model dimension.
    if embed_dim != input_embed_dim:
        self.project_in_dim = nn.Linear(input_embed_dim, embed_dim, bias=False)
    else:
        self.project_in_dim = None

    self.embed_positions = SinusoidalPositionalEmbedding(
        embed_dim,
        self.padding_idx,
    )

    self.layers = nn.ModuleList(
        [self.build_decoder_layer(args) for _ in range(args.decoder_layers)]
    )
    self.num_layers = len(self.layers)
    self.layer_norm = nn.LayerNorm(embed_dim)

    self.build_output_projection(args, dictionary)
|
| 80 |
+
|
| 81 |
+
def build_output_projection(self, args, dictionary):
    """Create the bias-free vocabulary projection and initialize its
    weights with a normal distribution scaled by 1/sqrt(embed_dim)."""
    vocab_size = len(dictionary)
    projection = nn.Linear(args.decoder_embed_dim, vocab_size, bias=False)
    nn.init.normal_(
        projection.weight, mean=0, std=args.decoder_embed_dim ** -0.5
    )
    self.output_projection = projection
|
| 88 |
+
|
| 89 |
+
def build_decoder_layer(self, args):
    """Construct a single decoder layer; subclasses may override."""
    layer = TransformerDecoderLayer(args)
    return layer
|
| 91 |
+
|
| 92 |
+
def forward(
    self,
    prev_output_tokens,
    encoder_out: Optional[Dict[str, List[Tensor]]] = None,
    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
    features_only: bool = False,
    return_all_hiddens: bool = False,
):
    """
    Args:
        prev_output_tokens (LongTensor): previous decoder outputs of shape
            `(batch, tgt_len)`, for teacher forcing
        encoder_out (optional): output from the encoder, used for
            encoder-side attention, should be of size T x B x C
        incremental_state (dict): dictionary used for storing state during
            :ref:`Incremental decoding`
        features_only (bool, optional): only return features without
            applying output layer (default: False).

    Returns:
        tuple:
            - the decoder's output: logits of shape
              `(batch, vocab, tgt_len)` (note the trailing transpose), or
              features of shape `(batch, tgt_len, embed_dim)` when
              *features_only* is True
            - a dictionary with any model-specific outputs
    """

    x, extra = self.extract_features(
        prev_output_tokens,
        encoder_out=encoder_out,
        incremental_state=incremental_state,
    )

    if not features_only:
        x = self.output_layer(x)
        x = x.transpose(1, 2)  # B x T x C -> B x C x T
    return x, extra
|
| 127 |
+
|
| 128 |
+
def extract_features(
    self,
    prev_output_tokens,
    encoder_out: Optional[Dict[str, List[Tensor]]],
    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
):
    """
    Similar to *forward* but only return features.

    Includes several features from "Jointly Learning to Align and
    Translate with Transformer Models" (Garg et al., EMNLP 2019).

    Returns:
        tuple:
            - the decoder's features of shape `(batch, tgt_len, embed_dim)`
            - a dictionary with any model-specific outputs
    """
    bs, slen = prev_output_tokens.size()

    # Unpack the encoder outputs (lists may be empty when no encoder is used).
    enc: Optional[Tensor] = None
    padding_mask: Optional[Tensor] = None
    if encoder_out is not None and len(encoder_out["encoder_out"]) > 0:
        enc = encoder_out["encoder_out"][0]
        assert (
            enc.size()[1] == bs
        ), f"Expected enc.shape == (t, {bs}, c) got {enc.shape}"
    if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0:
        padding_mask = encoder_out["encoder_padding_mask"][0]

    # embed positions (computed on the FULL token sequence so the positional
    # index is correct even during incremental decoding)
    positions = self.embed_positions(
        prev_output_tokens
    )

    if incremental_state is not None:
        # Incremental decoding: only the newest token needs to be processed.
        prev_output_tokens = prev_output_tokens[:, -1:]
        positions = positions[:, -1:]

    # embed tokens and positions
    x = self.embed_scale * self.embed_tokens(prev_output_tokens)

    if self.project_in_dim is not None:
        # Bridge a mismatch between embedding width and model width.
        x = self.project_in_dim(x)

    x += positions

    x = self.dropout_module(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # Only build a padding mask when padding is actually present.
    self_attn_padding_mask: Optional[Tensor] = None
    if prev_output_tokens.eq(self.padding_idx).any():
        self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)

    # decoder layers
    attn: Optional[Tensor] = None
    inner_states: List[Optional[Tensor]] = [x]
    for idx, layer in enumerate(self.layers):
        if incremental_state is None:
            # Full-sequence decoding needs a causal (future-blocking) mask;
            # incremental decoding sees one step at a time, so none is needed.
            self_attn_mask = self.buffered_future_mask(x)
        else:
            self_attn_mask = None

        x, layer_attn, _ = layer(
            x,
            enc,
            padding_mask,
            incremental_state,
            self_attn_mask=self_attn_mask,
            self_attn_padding_mask=self_attn_padding_mask,
            need_attn=False,
            need_head_weights=False,
        )
        inner_states.append(x)

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    return x, {"inner_states": inner_states}
|
| 211 |
+
|
| 212 |
+
def output_layer(self, features):
    """Project features to the vocabulary size."""
    projection = self.output_projection
    return projection(features)
|
| 215 |
+
|
| 216 |
+
def buffered_future_mask(self, tensor):
    """Return a cached `(dim, dim)` causal mask with -inf above the diagonal.

    The cache is rebuilt whenever it is empty, too small, or lives on the
    wrong device. `self._future_mask.device != tensor.device` is not
    working in TorchScript, so the comparison uses `not ... ==` instead.
    """
    dim = tensor.size(0)
    cache = self._future_mask
    stale = (
        cache.size(0) == 0
        or (not cache.device == tensor.device)
        or cache.size(0) < dim
    )
    if stale:
        self._future_mask = torch.triu(
            fill_with_neg_inf(torch.zeros([dim, dim])), 1
        )
    self._future_mask = self._future_mask.to(tensor)
    return self._future_mask[:dim, :dim]
|