kabudadada committed on
Commit
e76b79a
·
1 Parent(s): 6a001dc

Add esm folder and minimal app

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. Dockerfile +12 -0
  3. app.py +17 -0
  4. esm/mcp_output/README_MCP.md +144 -0
  5. esm/mcp_output/analysis.json +163 -0
  6. esm/mcp_output/env_info.json +17 -0
  7. esm/mcp_output/mcp_logs/llm_statistics.json +11 -0
  8. esm/mcp_output/mcp_logs/run_log.json +73 -0
  9. esm/mcp_output/mcp_plugin/__init__.py +0 -0
  10. esm/mcp_output/mcp_plugin/__pycache__/adapter.cpython-310.pyc +0 -0
  11. esm/mcp_output/mcp_plugin/__pycache__/mcp_service.cpython-310.pyc +0 -0
  12. esm/mcp_output/mcp_plugin/adapter.py +423 -0
  13. esm/mcp_output/mcp_plugin/main.py +13 -0
  14. esm/mcp_output/mcp_plugin/mcp_service.py +256 -0
  15. esm/mcp_output/predictions/prediction_20250823_235651.pdb +528 -0
  16. esm/mcp_output/predictions/prediction_20250830_220641.pdb +489 -0
  17. esm/mcp_output/requirements.txt +4 -0
  18. esm/mcp_output/start_mcp.py +34 -0
  19. esm/mcp_output/tests_mcp/test_mcp_basic.py +49 -0
  20. esm/mcp_output/tests_smoke/test_smoke.py +29 -0
  21. esm/source/.flake8 +10 -0
  22. esm/source/.git-blame-ignore-revs +2 -0
  23. esm/source/.github/ISSUE_TEMPLATE/bug.md +27 -0
  24. esm/source/.gitignore +31 -0
  25. esm/source/CODE_OF_CONDUCT.rst +6 -0
  26. esm/source/CONTRIBUTING.md +31 -0
  27. esm/source/LICENSE +21 -0
  28. esm/source/README.md +795 -0
  29. esm/source/__init__.py +4 -0
  30. esm/source/environment.yml +36 -0
  31. esm/source/esm/__init__.py +12 -0
  32. esm/source/esm/axial_attention.py +239 -0
  33. esm/source/esm/constants.py +10 -0
  34. esm/source/esm/data.py +493 -0
  35. esm/source/esm/esmfold/v1/__init__.py +0 -0
  36. esm/source/esm/esmfold/v1/categorical_mixture.py +43 -0
  37. esm/source/esm/esmfold/v1/esmfold.py +364 -0
  38. esm/source/esm/esmfold/v1/misc.py +309 -0
  39. esm/source/esm/esmfold/v1/pretrained.py +181 -0
  40. esm/source/esm/esmfold/v1/tri_self_attn_block.py +160 -0
  41. esm/source/esm/esmfold/v1/trunk.py +243 -0
  42. esm/source/esm/inverse_folding/__init__.py +8 -0
  43. esm/source/esm/inverse_folding/features.py +352 -0
  44. esm/source/esm/inverse_folding/gvp_encoder.py +56 -0
  45. esm/source/esm/inverse_folding/gvp_modules.py +475 -0
  46. esm/source/esm/inverse_folding/gvp_transformer.py +140 -0
  47. esm/source/esm/inverse_folding/gvp_transformer_encoder.py +184 -0
  48. esm/source/esm/inverse_folding/gvp_utils.py +68 -0
  49. esm/source/esm/inverse_folding/multichain_util.py +152 -0
  50. esm/source/esm/inverse_folding/transformer_decoder.py +228 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.p filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+ RUN useradd -m -u 1000 user
3
+ USER user
4
+ ENV PATH="/home/user/.local/bin:$PATH"
5
+ WORKDIR /app
6
+
7
+ COPY --chown=user ./requirements.txt requirements.txt
8
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
9
+
10
+ COPY --chown=user . /app
11
+ EXPOSE 7860
12
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import FastAPI, WebSocket

app = FastAPI()


@app.get("/")
async def root():
    """Liveness probe: report that the service is reachable."""
    payload = {"status": "ok", "service": "Code2MCP-esm"}
    return payload


@app.websocket("/ws")
async def websocket_endpoint(ws: WebSocket):
    """Placeholder WebSocket handler.

    Accepts the connection, emits one informational message, and closes.
    Swap in the real MCP/ESM handler here.
    """
    await ws.accept()
    greeting = "WebSocket is up. Replace with your MCP/ESM handler."
    await ws.send_text(greeting)
    await ws.close()
esm/mcp_output/README_MCP.md ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ESM: Evolutionary Scale Modeling for Protein Sequences
2
+
3
+ ## Overview
4
+
5
+ `facebookresearch/esm` is an open-source project developed by Facebook AI Research (FAIR) for deep learning-based protein sequence modeling. It provides state-of-the-art tools for analyzing and predicting protein structures, functions, and variant effects using advanced language models and deep learning techniques.
6
+
7
+ ### Key Features
8
+
9
+ - **Protein Language Models**: Pretrained models like ESM-1 and ESM-2 capture semantic information in protein sequences.
10
+ - **Multiple Sequence Alignment (MSA) Modeling**: Tools for protein modeling based on MSA, including MSA Transformer.
11
+ - **Inverse Folding**: Predict how protein sequences fold into 3D structures.
12
+ - **Variant Effect Prediction**: Assess the impact of mutations on protein functionality.
13
+ - **Contact Prediction**: Predict residue-residue contacts in protein sequences.
14
+ - **Metagenomic Analysis**: Analyze environmental protein sequences using the ESM Metagenomic Atlas.
15
+ - **Feature Extraction**: Tools like `esm-extract` for extracting features from pretrained models.
16
+
17
+ This repository is designed for researchers and developers in computational biology, bioinformatics, and related fields.
18
+
19
+ ---
20
+
21
+ ## Installation
22
+
23
+ ### Prerequisites
24
+
25
+ - Python 3.8 or later
26
+ - PyTorch 1.8 or later
27
+ - GPU support (optional but recommended for large-scale computations)
28
+
29
+ ### Installation Steps
30
+
31
+ 1. Clone the repository:
32
+ ```
33
+ git clone https://github.com/facebookresearch/esm.git
34
+ cd esm
35
+ ```
36
+
37
+ 2. Install dependencies:
38
+ ```
39
+ pip install -r requirements.txt
40
+ ```
41
+
42
+ 3. (Optional) Set up a virtual environment:
43
+ ```
44
+ python -m venv esm_env
45
+ source esm_env/bin/activate
46
+ ```
47
+
48
+ 4. Install the package:
49
+ ```
50
+ pip install .
51
+ ```
52
+
53
+ 5. (Optional) Install additional dependencies for specific features:
54
+ ```
55
+ pip install fairscale pandas
56
+ ```
57
+
58
+ ---
59
+
60
+ ## Usage
61
+
62
+ ### Loading Pretrained Models
63
+
64
+ The repository provides pretrained models for various tasks. You can load a model using the following example:
65
+
66
+ ```
67
+ from esm.pretrained import load_model_and_alphabet
68
+ model, alphabet = load_model_and_alphabet("esm2_t33_650M_UR50D")
69
+ ```
70
+
71
+ ### Command-Line Tools
72
+
73
+ The repository includes several command-line tools for common tasks:
74
+
75
+ #### 1. `esm-extract`
76
+ Extract features from protein sequences using pretrained models.
77
+
78
+ **Usage:**
79
+ ```
80
+ esm-extract --model esm2_t33_650M_UR50D --fasta input.fasta --output output.pt
81
+ ```
82
+
83
+ #### 2. `esm-fold`
84
+ Predict the 3D structure of a protein sequence.
85
+
86
+ **Usage:**
87
+ ```
88
+ esm-fold --model esm2_t33_650M_UR50D --fasta input.fasta --output output.pdb
89
+ ```
90
+
91
+ ---
92
+
93
+ ## Available Tools and Endpoints
94
+
95
+ ### Core Modules
96
+
97
+ - **`esm.pretrained`**: Load pretrained models.
98
+ - Functions: `load_model_and_alphabet`, `load_model_and_alphabet_local`
99
+ - **`esm.data`**: Handle protein sequence data.
100
+ - Functions: `Alphabet`, `BatchConverter`
101
+ - **`esm.inverse_folding`**: Tools for inverse folding tasks.
102
+ - Functions: `load_inverse_folding_model`
103
+ - Classes: `GVPTransformerEncoder`, `GVPTransformerDecoder`
104
+ - **`esm.model`**: Core model definitions.
105
+ - Classes: `ESM1`, `ESM2`, `MSATransformer`
106
+
107
+ ### CLI Commands
108
+
109
+ - **`esm-extract`**: Extract features from protein sequences.
110
+ - **`esm-fold`**: Predict protein 3D structures.
111
+
112
+ ---
113
+
114
+ ## Notes and Troubleshooting
115
+
116
+ ### Notes
117
+
118
+ 1. **Model Size**: Pretrained models like ESM-2 are large and may require significant memory. Use a GPU for optimal performance.
119
+ 2. **Dependencies**: Ensure all required dependencies are installed. Optional dependencies like `fairscale` and `pandas` are needed for specific features.
120
+ 3. **Input Formats**: Protein sequences should be provided in FASTA format for most tools.
121
+
122
+ ### Troubleshooting
123
+
124
+ - **Out of Memory Errors**: If you encounter memory issues, try reducing batch size or using a smaller model.
125
+ - **Installation Issues**: Ensure you are using a compatible Python and PyTorch version.
126
+ - **Model Loading Errors**: Verify the model name and ensure the model weights are downloaded correctly.
127
+
128
+ ---
129
+
130
+ ## Contributing
131
+
132
+ We welcome contributions to improve the repository. Please follow the guidelines in the `CONTRIBUTING.md` file.
133
+
134
+ ---
135
+
136
+ ## License
137
+
138
+ This project is licensed under the MIT License. See the `LICENSE` file for details.
139
+
140
+ ---
141
+
142
+ ## Acknowledgments
143
+
144
+ This repository is developed and maintained by Facebook AI Research (FAIR). For more information, visit the [official repository](https://github.com/facebookresearch/esm).
esm/mcp_output/analysis.json ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "summary": {
3
+ "repository_url": "https://github.com/facebookresearch/esm",
4
+ "summary": "Repository: facebookresearch/esm\nCommit: main\nFiles analyzed: 100+\n\nEstimated tokens: 500k+",
5
+ "file_tree": "...",
6
+ "content": {},
7
+ "processed_by": "gitingest",
8
+ "success": true
9
+ },
10
+ "structure": {
11
+ "packages": [
12
+ "source.esm",
13
+ "source.scripts",
14
+ "source.examples"
15
+ ]
16
+ },
17
+ "dependencies": {
18
+ "has_environment_yml": true,
19
+ "has_requirements_txt": true,
20
+ "pyproject": false,
21
+ "setup_cfg": false,
22
+ "setup_py": true
23
+ },
24
+ "entry_points": {
25
+ "imports": [],
26
+ "cli": [],
27
+ "modules": []
28
+ },
29
+ "llm_analysis": {
30
+ "core_modules": [
31
+ {
32
+ "package": "source.esm",
33
+ "module": "__init__",
34
+ "functions": [],
35
+ "classes": [],
36
+ "description": "Entry point for the ESM core module, may expose some core APIs."
37
+ },
38
+ {
39
+ "package": "source.esm",
40
+ "module": "pretrained",
41
+ "functions": [
42
+ "load_model_and_alphabet",
43
+ "load_model_and_alphabet_local"
44
+ ],
45
+ "classes": [],
46
+ "description": "Provides functionality to load pretrained models, either from local or remote sources."
47
+ },
48
+ {
49
+ "package": "source.esm",
50
+ "module": "data",
51
+ "functions": [],
52
+ "classes": [
53
+ "Alphabet",
54
+ "BatchConverter"
55
+ ],
56
+ "description": "Module for handling protein sequence data, including alphabet definition and batch conversion."
57
+ },
58
+ {
59
+ "package": "source.esm",
60
+ "module": "inverse_folding",
61
+ "functions": [
62
+ "load_inverse_folding_model"
63
+ ],
64
+ "classes": [],
65
+ "description": "Core module for inverse folding tasks, containing the Geometric Vector Perceptron (GVP) architecture."
66
+ },
67
+ {
68
+ "package": "source.esm",
69
+ "module": "model",
70
+ "functions": [],
71
+ "classes": [
72
+ "ESM1",
73
+ "ESM2",
74
+ "MSATransformer"
75
+ ],
76
+ "description": "Core model definition module, including ESM-1, ESM-2, and MSA Transformer."
77
+ },
78
+ {
79
+ "package": "source.examples",
80
+ "module": "lm_design",
81
+ "functions": [
82
+ "generate_fixed_backbone",
83
+ "generate_free_backbone"
84
+ ],
85
+ "classes": [],
86
+ "description": "Protein language model design module, supporting fixed backbone and free generation."
87
+ },
88
+ {
89
+ "package": "source.examples",
90
+ "module": "variant_prediction",
91
+ "functions": [
92
+ "predict_variant_effect"
93
+ ],
94
+ "classes": [],
95
+ "description": "Variant effect prediction module, assessing the functional impact of mutations in protein sequences."
96
+ },
97
+ {
98
+ "package": "source.scripts",
99
+ "module": "extract",
100
+ "functions": [
101
+ "extract_features"
102
+ ],
103
+ "classes": [],
104
+ "description": "Utility module for extracting features from models."
105
+ },
106
+ {
107
+ "package": "source.scripts",
108
+ "module": "fold",
109
+ "functions": [
110
+ "predict_structure"
111
+ ],
112
+ "classes": [],
113
+ "description": "Utility module for predicting protein structures."
114
+ }
115
+ ],
116
+ "cli_commands": [
117
+ {
118
+ "command": "esm-extract",
119
+ "description": "Extract features for protein sequences from a pretrained model."
120
+ },
121
+ {
122
+ "command": "esm-fold",
123
+ "description": "Predict protein structures using the ESM model."
124
+ }
125
+ ],
126
+ "import_strategy": {
127
+ "primary": "import",
128
+ "fallback": "cli",
129
+ "confidence": 0.9
130
+ },
131
+ "dependencies": {
132
+ "required": [
133
+ "torch",
134
+ "fair-esm",
135
+ "requests",
136
+ "biopython"
137
+ ],
138
+ "optional": []
139
+ },
140
+ "risk_assessment": {
141
+ "import_feasibility": 0.9,
142
+ "intrusiveness_risk": "low",
143
+ "complexity": "high"
144
+ }
145
+ },
146
+ "deepwiki_analysis": {
147
+ "repo_url": "https://github.com/facebookresearch/esm",
148
+ "repo_name": "esm",
149
+ "analysis": "### Analysis Report: GitHub Repository `facebookresearch/esm`\n\n#### 1. What are the main functions and purposes of this repository?\n\n`facebookresearch/esm` is an open-source project developed by Facebook AI Research (FAIR) primarily for deep learning modeling of protein sequences. Its core objective is to analyze and predict protein structure, function, and variant effects using Language Models (LMs) and deep learning techniques. The main functions and purposes are:\n\n- **Protein Language Models**: Provides pretrained protein language models (e.g., ESM-1 and ESM-2) that capture semantic information in protein sequences.\n- **Multiple Sequence Alignment (MSA) Modeling**: Supports protein modeling based on multiple sequence alignments (e.g., MSA Transformer).\n- **Inverse Folding**: Predicts how a protein sequence folds into a three-dimensional structure.\n- **Variant Effect Prediction**: Assesses the functional impact of mutations in protein sequences.\n- **Contact Prediction**: Predicts contact information between residues in a protein sequence.\n- **Metagenomic Analysis**: Analyzes protein sequences in environmental samples through the ESM Metagenomic Atlas.\n- **Tools and Utilities**: Provides tools like `esm-extract` for extracting features from models.\n\n#### 2. 
What are the core modules and entry points of this repository?\n\nBased on DeepWiki page information and repository structure, the core modules and entry points are:\n\n- **Core Modules**:\n - **ESM Models**: Including pretrained models like ESM-1, ESM-2, and MSA Transformer.\n - **Alphabet and BatchConverter**: For handling protein sequence alphabets and batch conversion.\n - **esm-extract**: A utility module for extracting features from models.\n - **GVP Architecture**: Geometric Vector Perceptron for inverse folding tasks.\n - **ESM Metagenomic Atlas**: A submodule for metagenomic analysis.\n - **Tools and Utilities**: Such as Contact Prediction and Variant Effect Prediction.\n\n- **Main Entry Points**:\n - **Pretrained Models**: `esm.pretrained.load_model_and_alphabet()`\n - **Scripts**: `scripts/extract.py`, `scripts/fold.py`\n - **Examples**: `examples/variant_prediction/predict.py`\n\n#### 3. What are the main technology stacks and dependencies used by this repository?\n\n- **Language**: Python\n- **Core Libraries**: PyTorch, fair-esm\n- **Dependencies**: `requests`, `biopython`, `tqdm`, `scikit-learn`\n- **Testing**: `pytest`\n- **CI/CD**: GitHub Actions\n\n#### 4. Is this project suitable for conversion to an MCP (Model Context Protocol) service? Why?\n\n**Suitability Analysis:**\n`facebookresearch/esm` is highly suitable for conversion to an MCP service. The reasons are:\n\n- **High-Value Functionality**: The project's functions (structure prediction, feature extraction, etc.) 
are of high value and widely applicable.\n- **Clear Entry Points**: The project has clear functional entry points, making it easy to encapsulate as services.\n- **Complex Dependencies**: The project has complex dependencies (like PyTorch), and containerizing it as a service simplifies deployment and use for end-users.\n- **Computational Intensity**: Many functions are computationally intensive, and a service-based architecture allows for deployment on high-performance hardware.\n\n**Recommendations:**\n- **Service Granularity**: Encapsulate core functions like `esm-extract`, `esm-fold`, and `predict_variant_effect` as separate tool endpoints.\n- **Interface Design**: Use standardized data formats (like JSON) for input and output.\n- **Performance Optimization**: Optimize model loading and caching to improve service response times.\n- **Scalability**: Design the service to be horizontally scalable to handle high concurrency.",
150
+ "model": "gpt-4o",
151
+ "source": "llm_direct_analysis",
152
+ "success": true
153
+ },
154
+ "deepwiki_options": {
155
+ "enabled": true,
156
+ "model": "gpt-4o"
157
+ },
158
+ "risk": {
159
+ "import_feasibility": 0.9,
160
+ "intrusiveness_risk": "low",
161
+ "complexity": "high"
162
+ }
163
+ }
esm/mcp_output/env_info.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "environment": {
3
+ "type": "conda",
4
+ "name": "esm_774629_env",
5
+ "files": {
6
+ "pyproject_toml": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\pyproject.toml"
7
+ },
8
+ "python": "3.10",
9
+ "exec_prefix": []
10
+ },
11
+ "original_tests": {
12
+ "passed": true,
13
+ "report_path": null
14
+ },
15
+ "timestamp": 1755775471.7781281,
16
+ "conda_available": true
17
+ }
esm/mcp_output/mcp_logs/llm_statistics.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_calls": 4,
3
+ "failed_calls": 0,
4
+ "retry_count": 0,
5
+ "total_prompt_tokens": 52280,
6
+ "total_completion_tokens": 5432,
7
+ "total_tokens": 57712,
8
+ "average_prompt_tokens": 13070.0,
9
+ "average_completion_tokens": 1358.0,
10
+ "average_tokens": 14428.0
11
+ }
esm/mcp_output/mcp_logs/run_log.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": 1755775629.137685,
3
+ "node": "RunNode",
4
+ "test_result": {
5
+ "passed": false,
6
+ "report_path": null,
7
+ "stdout": "",
8
+ "stderr": "repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\mcp_service.py\", line 8, in <module>\n\n from esm import pretrained, data, inverse_folding, model\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\__init__.py\", line 6, in <module>\n\n from . import gvp_transformer\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\gvp_transformer.py\", line 16, in <module>\n\n from .features import DihedralFeatures\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\features.py\", line 73, in <module>\n\n from .gvp_modules import GVP, LayerNorm\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\gvp_modules.py\", line 33, in <module>\n\n from torch_geometric.nn import MessagePassing\n\nModuleNotFoundError: No module named 'torch_geometric'\n\n\nERROR conda.cli.main_run:execute(49): `conda run python mcp_output\\start_mcp.py` failed. (See above for error)\n"
9
+ },
10
+ "run_result": {
11
+ "success": false,
12
+ "test_passed": false,
13
+ "exit_code": 1,
14
+ "stdout": "",
15
+ "stderr": "repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\mcp_service.py\", line 8, in <module>\n\n from esm import pretrained, data, inverse_folding, model\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\__init__.py\", line 6, in <module>\n\n from . import gvp_transformer\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\gvp_transformer.py\", line 16, in <module>\n\n from .features import DihedralFeatures\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\features.py\", line 73, in <module>\n\n from .gvp_modules import GVP, LayerNorm\n\n File \"E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\esm\\inverse_folding\\gvp_modules.py\", line 33, in <module>\n\n from torch_geometric.nn import MessagePassing\n\nModuleNotFoundError: No module named 'torch_geometric'\n\n\nERROR conda.cli.main_run:execute(49): `conda run python mcp_output\\start_mcp.py` failed. (See above for error)\n",
16
+ "timestamp": 1755775629.137685,
17
+ "details": {
18
+ "command": "D:\\download\\Anaconda\\Scripts\\conda.exe run -n esm_774629_env --cwd E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm python mcp_output\\start_mcp.py",
19
+ "working_directory": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm",
20
+ "environment_type": "conda"
21
+ }
22
+ },
23
+ "environment": {
24
+ "type": "conda",
25
+ "name": "esm_774629_env",
26
+ "files": {
27
+ "pyproject_toml": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\source\\pyproject.toml"
28
+ },
29
+ "python": "3.10",
30
+ "exec_prefix": []
31
+ },
32
+ "plugin_info": {
33
+ "files": {
34
+ "mcp_output/start_mcp.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\start_mcp.py",
35
+ "mcp_output/mcp_plugin/__init__.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\__init__.py",
36
+ "mcp_output/mcp_plugin/mcp_service.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\mcp_service.py",
37
+ "mcp_output/mcp_plugin/adapter.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\adapter.py",
38
+ "mcp_output/mcp_plugin/main.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\mcp_plugin\\main.py",
39
+ "mcp_output/requirements.txt": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\requirements.txt",
40
+ "mcp_output/README_MCP.md": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\README_MCP.md",
41
+ "mcp_output/tests_mcp/test_mcp_basic.py": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\tests_mcp\\test_mcp_basic.py"
42
+ },
43
+ "adapter_mode": "import",
44
+ "endpoints": [
45
+ "health",
46
+ "version",
47
+ "load_model_and_alphabet*",
48
+ "load_model_and_alphabet_local*",
49
+ "Alphabet",
50
+ "BatchConverter",
51
+ "load_inverse_folding_model*",
52
+ "gvptransformerencoder*",
53
+ "gvptransformerdecoder*",
54
+ "esm1*",
55
+ "esm2",
56
+ "msatransformer",
57
+ "generate_fixed_backbone*",
58
+ "generate_free_backbone*",
59
+ "predict_variant_effect*",
60
+ "extract_features*",
61
+ "predict_structure*"
62
+ ],
63
+ "mcp_dir": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\mcp_plugin",
64
+ "tests_dir": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\tests_mcp",
65
+ "main_entry": "start_mcp.py",
66
+ "readme_path": "E:\\code\\fastMCP\\fastMCP\\mcp-repo-output\\workspace\\esm\\mcp_output\\README_MCP.md",
67
+ "requirements": [
68
+ "fastmcp>=0.1.0",
69
+ "pydantic>=2.0.0"
70
+ ]
71
+ },
72
+ "fastmcp_installed": false
73
+ }
esm/mcp_output/mcp_plugin/__init__.py ADDED
File without changes
esm/mcp_output/mcp_plugin/__pycache__/adapter.cpython-310.pyc ADDED
Binary file (6.54 kB). View file
 
esm/mcp_output/mcp_plugin/__pycache__/mcp_service.cpython-310.pyc ADDED
Binary file (6.65 kB). View file
 
esm/mcp_output/mcp_plugin/adapter.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ # Set path
5
+ source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "source")
6
+ sys.path.insert(0, source_path)
7
+
8
+ # Import modules
9
+ try:
10
+ from esm.pretrained import load_model_and_alphabet, load_model_and_alphabet_local
11
+ from esm.data import Alphabet, BatchConverter
12
+ from esm.inverse_folding import load_inverse_folding_model
13
+ from esm.model import ESM1, ESM2, MSATransformer
14
+ from examples.lm_design.lm_design import generate_fixed_backbone, generate_free_backbone
15
+ from examples.variant_prediction.predict import predict_variant_effect
16
+ from scripts.extract import extract_features
17
+ from scripts.fold import predict_structure
18
+ except ImportError as e:
19
+ print(f"Module import failed: {e}, some functions will be unavailable.")
20
+
21
+ class Adapter:
22
+ """
23
+ MCP Import mode adapter class for encapsulating core functionality of facebookresearch/esm repository.
24
+ """
25
+
26
+ def __init__(self):
27
+ """
28
+ Initialize adapter class.
29
+ """
30
+ self.mode = "import"
31
+ self.models = {}
32
+
33
+ # ------------------------- Model Loading Module -------------------------
34
+
35
+ def load_pretrained_model(self, model_name, local_path=None):
36
+ """
37
+ Load pre-trained model.
38
+
39
+ Parameters:
40
+ - model_name: str, model name.
41
+ - local_path: str, optional, local model path.
42
+
43
+ Returns:
44
+ - dict: Information containing status and model instance.
45
+ """
46
+ try:
47
+ if local_path:
48
+ model, alphabet = load_model_and_alphabet_local(local_path)
49
+ else:
50
+ model, alphabet = load_model_and_alphabet(model_name)
51
+ self.models[model_name] = model
52
+ return {"status": "success", "model": model, "alphabet": alphabet}
53
+ except Exception as e:
54
+ return {"status": "error", "message": f"Failed to load model: {e}"}
55
+
56
+ def load_inverse_folding_model(self, model_name):
57
+ """
58
+ Load inverse folding model.
59
+
60
+ Parameters:
61
+ - model_name: str, model name.
62
+
63
+ Returns:
64
+ - dict: Information containing status and model instance.
65
+ """
66
+ try:
67
+ model = load_inverse_folding_model(model_name)
68
+ self.models[model_name] = model
69
+ return {"status": "success", "model": model}
70
+ except Exception as e:
71
+ return {"status": "error", "message": f"Failed to load inverse folding model: {e}"}
72
+
73
+ # ------------------------- Data Processing Module -------------------------
74
+
75
+ def create_alphabet(self):
76
+ """
77
+ Create alphabet for protein sequences.
78
+
79
+ Returns:
80
+ - dict: Information containing status and Alphabet instance.
81
+ """
82
+ try:
83
+ alphabet = Alphabet()
84
+ return {"status": "success", "alphabet": alphabet}
85
+ except Exception as e:
86
+ return {"status": "error", "message": f"Failed to create alphabet: {e}"}
87
+
88
+ def create_batch_converter(self, alphabet):
89
+ """
90
+ Create batch converter.
91
+
92
+ Parameters:
93
+ - alphabet: Alphabet instance.
94
+
95
+ Returns:
96
+ - dict: Information containing status and BatchConverter instance.
97
+ """
98
+ try:
99
+ batch_converter = BatchConverter(alphabet)
100
+ return {"status": "success", "batch_converter": batch_converter}
101
+ except Exception as e:
102
+ return {"status": "error", "message": f"Failed to create batch converter: {e}"}
103
+
104
+ # ------------------------- Model Instantiation Module -------------------------
105
+
106
+ def create_esm1_model(self, num_layers=12, embed_dim=768, attention_heads=12, alphabet_size=33):
107
+ """
108
+ Instantiate ESM1 model.
109
+
110
+ Parameters:
111
+ - num_layers: int, number of transformer layers (default: 12)
112
+ - embed_dim: int, embedding dimension (default: 768)
113
+ - attention_heads: int, number of attention heads (default: 12)
114
+ - alphabet_size: int, size of the alphabet (default: 33)
115
+
116
+ Returns:
117
+ - dict: Information containing status and ESM1 instance.
118
+ """
119
+ try:
120
+ model = ESM1(
121
+ num_layers=num_layers,
122
+ embed_dim=embed_dim,
123
+ attention_heads=attention_heads,
124
+ alphabet_size=alphabet_size
125
+ )
126
+ return {"status": "success", "model": model}
127
+ except Exception as e:
128
+ return {"status": "error", "message": f"Failed to instantiate ESM1 model: {e}"}
129
+
130
+ def create_esm2_model(self, num_layers=33, embed_dim=1280, attention_heads=20, alphabet_size=33):
131
+ """
132
+ Instantiate ESM2 model.
133
+
134
+ Parameters:
135
+ - num_layers: int, number of transformer layers (default: 33)
136
+ - embed_dim: int, embedding dimension (default: 1280)
137
+ - attention_heads: int, number of attention heads (default: 20)
138
+ - alphabet_size: int, size of the alphabet (default: 33)
139
+
140
+ Returns:
141
+ - dict: Information containing status and ESM2 instance.
142
+ """
143
+ try:
144
+ model = ESM2(
145
+ num_layers=num_layers,
146
+ embed_dim=embed_dim,
147
+ attention_heads=attention_heads,
148
+ alphabet_size=alphabet_size
149
+ )
150
+ return {"status": "success", "model": model}
151
+ except Exception as e:
152
+ return {"status": "error", "message": f"Failed to instantiate ESM2 model: {e}"}
153
+
154
+ def create_msa_transformer(self, num_layers=12, embed_dim=768, attention_heads=12, max_tokens_per_msa=2**14):
155
+ """
156
+ Instantiate MSA Transformer model.
157
+
158
+ Parameters:
159
+ - num_layers: int, number of transformer layers (default: 12)
160
+ - embed_dim: int, embedding dimension (default: 768)
161
+ - attention_heads: int, number of attention heads (default: 12)
162
+ - max_tokens_per_msa: int, maximum tokens per MSA (default: 2**14)
163
+
164
+ Returns:
165
+ - dict: Information containing status and MSATransformer instance.
166
+ """
167
+ try:
168
+ model = MSATransformer(
169
+ num_layers=num_layers,
170
+ embed_dim=embed_dim,
171
+ attention_heads=attention_heads,
172
+ max_tokens_per_msa=max_tokens_per_msa
173
+ )
174
+ return {"status": "success", "model": model}
175
+ except Exception as e:
176
+ return {"status": "error", "message": f"Failed to instantiate MSA Transformer model: {e}"}
177
+
178
+ # ------------------------- Function Call Module -------------------------
179
+
180
+ def generate_fixed_backbone(self, model, alphabet, pdb_file, chain_id, temperature=1.0, num_samples=1):
181
+ """
182
+ Call fixed backbone generation function.
183
+
184
+ Parameters:
185
+ - model: ESM model instance
186
+ - alphabet: Alphabet instance
187
+ - pdb_file: str, path to PDB file
188
+ - chain_id: str, chain identifier
189
+ - temperature: float, sampling temperature (default: 1.0)
190
+ - num_samples: int, number of samples to generate (default: 1)
191
+
192
+ Returns:
193
+ - dict: Information containing status and generation result.
194
+ """
195
+ try:
196
+ result = generate_fixed_backbone(
197
+ model=model,
198
+ alphabet=alphabet,
199
+ pdb_file=pdb_file,
200
+ chain_id=chain_id,
201
+ temperature=temperature,
202
+ num_samples=num_samples
203
+ )
204
+ return {"status": "success", "result": result}
205
+ except Exception as e:
206
+ return {"status": "error", "message": f"Failed to generate fixed backbone: {e}"}
207
+
208
+ def generate_free_backbone(self, model, alphabet, length, temperature=1.0, num_samples=1, device="cpu"):
209
+ """
210
+ Call free backbone generation function.
211
+
212
+ Parameters:
213
+ - model: ESM model instance
214
+ - alphabet: Alphabet instance
215
+ - length: int, desired sequence length
216
+ - temperature: float, sampling temperature (default: 1.0)
217
+ - num_samples: int, number of samples to generate (default: 1)
218
+ - device: str, device to use for computation (default: "cpu")
219
+
220
+ Returns:
221
+ - dict: Information containing status and generation result.
222
+ """
223
+ try:
224
+ result = generate_free_backbone(
225
+ model=model,
226
+ alphabet=alphabet,
227
+ length=length,
228
+ temperature=temperature,
229
+ num_samples=num_samples,
230
+ device=device
231
+ )
232
+ return {"status": "success", "result": result}
233
+ except Exception as e:
234
+ return {"status": "error", "message": f"Failed to generate free backbone: {e}"}
235
+
236
+ def predict_variant_effect(self, model, alphabet, sequence, mutations, batch_size=1, device="cpu"):
237
+ """
238
+ Call variant effect prediction function.
239
+
240
+ Parameters:
241
+ - model: ESM model instance
242
+ - alphabet: Alphabet instance
243
+ - sequence: str, wild-type protein sequence
244
+ - mutations: list, list of mutations in format ["A123V", "G456D"]
245
+ - batch_size: int, batch size for processing (default: 1)
246
+ - device: str, device to use for computation (default: "cpu")
247
+
248
+ Returns:
249
+ - dict: Information containing status and prediction result.
250
+ """
251
+ try:
252
+ result = predict_variant_effect(
253
+ model=model,
254
+ alphabet=alphabet,
255
+ sequence=sequence,
256
+ mutations=mutations,
257
+ batch_size=batch_size,
258
+ device=device
259
+ )
260
+ return {"status": "success", "result": result}
261
+ except Exception as e:
262
+ return {"status": "error", "message": f"Failed to predict variant effect: {e}"}
263
+
264
+ def extract_features(self, model, alphabet, sequences, repr_layers=[-1], include_contacts=False, device="cpu"):
265
+ """
266
+ Call feature extraction function.
267
+
268
+ Parameters:
269
+ - model: ESM model instance
270
+ - alphabet: Alphabet instance
271
+ - sequences: list, list of protein sequences
272
+ - repr_layers: list, layers to extract representations from (default: [-1])
273
+ - include_contacts: bool, whether to include contact predictions (default: False)
274
+ - device: str, device to use for computation (default: "cpu")
275
+
276
+ Returns:
277
+ - dict: Information containing status and extraction result.
278
+ """
279
+ try:
280
+ result = extract_features(
281
+ model=model,
282
+ alphabet=alphabet,
283
+ sequences=sequences,
284
+ repr_layers=repr_layers,
285
+ include_contacts=include_contacts,
286
+ device=device
287
+ )
288
+ return {"status": "success", "result": result}
289
+ except Exception as e:
290
+ return {"status": "error", "message": f"Failed to extract features: {e}"}
291
+
292
+ def predict_structure_local(self, model, alphabet, sequence, device="cpu"):
293
+ """
294
+ Call local structure prediction function.
295
+
296
+ Parameters:
297
+ - model: ESM model instance
298
+ - alphabet: Alphabet instance
299
+ - sequence: str, protein sequence
300
+ - device: str, device to use for computation (default: "cpu")
301
+
302
+ Returns:
303
+ - dict: Information containing status and prediction result.
304
+ """
305
+ try:
306
+ result = predict_structure(
307
+ model=model,
308
+ alphabet=alphabet,
309
+ sequence=sequence,
310
+ device=device
311
+ )
312
+ return {"status": "success", "result": result}
313
+ except Exception as e:
314
+ return {"status": "error", "message": f"Failed to predict structure: {e}"}
315
+
316
    def predict_structure(self, sequence):
        """
        Predict protein structure by POSTing the sequence to the public
        ESMFold API and summarizing the returned PDB text.

        Parameters:
        - sequence: str, protein amino acid sequence.

        Returns:
        - dict: Information containing status and prediction result.
        """
        try:
            # Lazy imports: requests and biopython are only needed by this
            # method, so the adapter loads even when they are absent.
            import requests
            from Bio.PDB import PDBParser
            import io

            # ESMFold API takes the raw sequence as the POST body.
            response = requests.post(
                "https://api.esmatlas.com/foldSequence/v1/pdb/",
                data=sequence,
                timeout=300
            )

            if response.status_code == 200 and response.text.strip():
                # Parse the PDB text in-memory to derive summary statistics.
                parser = PDBParser(QUIET=True)
                pdb_io = io.StringIO(response.text)
                structure = parser.get_structure("esmfold_prediction", pdb_io)

                structure_info = {
                    "num_models": len(structure),
                    "num_chains": len(list(structure.get_chains())),
                    "num_residues": len(list(structure.get_residues())),
                    "num_atoms": len(list(structure.get_atoms())),
                    "pdb_content": response.text
                }

                return {"status": "success", "result": structure_info}
            else:
                return {"status": "error", "message": f"API returned error: {response.status_code}"}

        # NOTE(review): if `import requests` itself fails above, evaluating
        # this except clause raises NameError — consider catching ImportError
        # separately. Left unchanged here.
        except requests.exceptions.Timeout:
            return {"status": "error", "message": "ESMFold API request timed out"}
        except Exception as e:
            return {"status": "error", "message": f"Error predicting structure: {e}"}
358
+
359
+ def analyze_protein_sequence(self, sequence):
360
+ """
361
+ Analyze basic features of a protein sequence.
362
+
363
+ Parameters:
364
+ - sequence: str, protein sequence.
365
+
366
+ Returns:
367
+ - dict: Information containing status and analysis result.
368
+ """
369
+ try:
370
+ length = len(sequence)
371
+ amino_acids = set(sequence)
372
+
373
+ composition = {}
374
+ for aa in amino_acids:
375
+ composition[aa] = sequence.count(aa)
376
+
377
+ result = {
378
+ "length": length,
379
+ "unique_amino_acids": len(amino_acids),
380
+ "composition": composition,
381
+ "sequence": sequence
382
+ }
383
+
384
+ return {"status": "success", "result": result}
385
+ except Exception as e:
386
+ return {"status": "error", "message": f"Failed to analyze sequence: {e}"}
387
+
388
+ def validate_protein_sequence(self, sequence):
389
+ """
390
+ Validate protein sequence format.
391
+
392
+ Parameters:
393
+ - sequence: str, protein sequence.
394
+
395
+ Returns:
396
+ - dict: Information containing status and validation result.
397
+ """
398
+ try:
399
+ valid_amino_acids = set("ACDEFGHIKLMNPQRSTVWY")
400
+ sequence_upper = sequence.upper()
401
+
402
+ invalid_chars = set(sequence_upper) - valid_amino_acids
403
+
404
+ is_valid = len(invalid_chars) == 0
405
+
406
+ result = {
407
+ "is_valid": is_valid,
408
+ "invalid_characters": list(invalid_chars) if invalid_chars else [],
409
+ "length": len(sequence),
410
+ "uppercase_sequence": sequence_upper
411
+ }
412
+
413
+ return {"status": "success", "result": result}
414
+ except Exception as e:
415
+ return {"status": "error", "message": f"Failed to validate sequence: {e}"}
416
+
417
+ # ------------------------- Fallback Mode Handling -------------------------
418
+
419
+ def fallback_mode(self):
420
+ """
421
+ Enable fallback mode, prompting the user that some functions are unavailable.
422
+ """
423
+ return {"status": "warning", "message": "Some functions are unavailable, please check module import status."}
esm/mcp_output/mcp_plugin/main.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
MCP Service Auto-Wrapper - Auto-generated
"""
from mcp_service import create_app


def main():
    """Build and return the MCP application instance."""
    return create_app()


if __name__ == "__main__":
    main().run()
esm/mcp_output/mcp_plugin/mcp_service.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ source_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "source")
5
+ sys.path.insert(0, source_path)
6
+
7
+ from fastmcp import FastMCP
8
+ from esm import pretrained, data, inverse_folding, model
9
+ # from examples.lm_design.lm_design import lm_design
10
+ # from examples.variant_prediction.predict import predict
11
+ # from scripts import extract, fold
12
+
13
+ mcp = FastMCP("esm_service")
14
+
15
@mcp.tool(name="load_pretrained_model", description="Load a pretrained ESM model")
def load_pretrained_model(model_name: str):
    """
    Load a pretrained ESM model.

    Parameters:
        model_name (str): Model name, e.g., 'esm1b_t33_650M_UR50S'.

    Returns:
        dict: Contains success/result/error fields.
    """
    try:
        loaded_model, loaded_alphabet = pretrained.load_model_and_alphabet(model_name)
    except Exception as exc:
        return {"success": False, "result": None, "error": str(exc)}
    return {
        "success": True,
        "result": {"model": loaded_model, "alphabet": loaded_alphabet},
        "error": None,
    }
31
+
32
@mcp.tool(name="process_sequence_data", description="Process protein sequence data")
def process_sequence_data(sequences: list):
    """
    Process protein sequence data using Alphabet and BatchConverter.

    Parameters:
        sequences (list): List of (label, description, sequence) tuples.

    Returns:
        dict: Contains success/result/error fields.
    """
    try:
        # Build the converter from a fresh Alphabet and run the batch in one go.
        converted = data.BatchConverter(data.Alphabet())(sequences)
    except Exception as exc:
        return {"success": False, "result": None, "error": str(exc)}
    return {"success": True, "result": converted, "error": None}
50
+
51
@mcp.tool(name="inverse_folding_model", description="Load inverse folding model")
def inverse_folding_model():
    """
    Load the core model for inverse folding tasks.

    Returns:
        dict: Contains success/result/error fields.
    """
    try:
        loaded = inverse_folding.load_inverse_folding_model()
    except Exception as exc:
        return {"success": False, "result": None, "error": str(exc)}
    return {"success": True, "result": loaded, "error": None}
64
+
65
@mcp.tool(name="generate_fixed_backbone", description="Generate protein sequence with fixed backbone")
def generate_fixed_backbone(input_data: dict):
    """
    Generate protein sequences using a fixed backbone.

    Parameters:
        input_data (dict): Input data payload.

    Returns:
        dict: Contains success/result/error fields.
    """
    try:
        # Fix: `lm_design` is never imported (its import is commented out at
        # module top), so the previous call raised NameError and callers saw
        # that raw error text instead of the intended "unavailable" message.
        # The call stays disabled until the backend import is restored:
        # result = lm_design.generate_fixed_backbone(input_data)
        return {"success": False, "result": None, "error": "This feature is currently unavailable"}
    except Exception as e:
        return {"success": False, "result": None, "error": str(e)}
81
+
82
@mcp.tool(name="predict_variant_effect", description="Predict protein variant effects")
def predict_variant_effect(sequence: str, mutation: str):
    """
    Predict the effect of a mutation in a protein sequence.

    Parameters:
        sequence (str): Protein sequence.
        mutation (str): Mutation description.

    Returns:
        dict: Contains success/result/error fields.
    """
    try:
        # Backend prediction is disabled until its module import is restored:
        # result = predict.predict_variant_effect(sequence, mutation)
        return {"success": False, "result": None, "error": "This feature is currently unavailable"}
    except Exception as exc:
        return {"success": False, "result": None, "error": str(exc)}
99
+
100
@mcp.tool(name="extract_features", description="Extract features from model")
def extract_features(sequence: str):
    """
    Extract features of a protein sequence from a pretrained model.

    Parameters:
        sequence (str): Protein sequence.

    Returns:
        dict: Contains success/result/error fields.
    """
    try:
        # Fix: `extract` (scripts.extract) is never imported — its import is
        # commented out at module top — so the previous call raised NameError
        # at runtime and the tool returned the raw exception text. Report the
        # feature as unavailable instead, matching the other disabled tools.
        # features = extract.extract_features(sequence)
        return {"success": False, "result": None, "error": "This feature is currently unavailable"}
    except Exception as e:
        return {"success": False, "result": None, "error": str(e)}
116
+
117
@mcp.tool(name="predict_structure", description="Predict protein structure using ESMFold API")
def predict_structure(sequence: str):
    """
    Predict protein structure by POSTing the sequence to the public ESMFold
    API, saving the returned PDB under predictions/, and summarizing it.

    Parameters:
        sequence (str): Protein amino acid sequence.

    Returns:
        dict: Dictionary containing the prediction result.
    """
    try:
        # Lazy imports: requests and biopython are only needed by this tool.
        import requests
        from Bio.PDB import PDBParser
        import io
        import datetime

        # Call ESMFold API (raw sequence as POST body; 5-minute timeout)
        response = requests.post(
            "https://api.esmatlas.com/foldSequence/v1/pdb/",
            data=sequence,
            timeout=300
        )

        if response.status_code == 200 and response.text.strip():
            # Parse the PDB text in-memory to derive summary statistics.
            parser = PDBParser(QUIET=True)
            pdb_io = io.StringIO(response.text)
            structure = parser.get_structure("esmfold_prediction", pdb_io)

            # Persist the raw PDB next to this package under predictions/.
            predictions_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "predictions")
            os.makedirs(predictions_dir, exist_ok=True)

            # Timestamped filename keeps successive predictions from clobbering
            # each other (one-second resolution).
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            pdb_filename = f"prediction_{timestamp}.pdb"
            pdb_filepath = os.path.join(predictions_dir, pdb_filename)

            # Save PDB file
            with open(pdb_filepath, 'w') as f:
                f.write(response.text)

            # Extract structure info
            structure_info = {
                "num_models": len(structure),
                "num_chains": len(list(structure.get_chains())),
                "num_residues": len(list(structure.get_residues())),
                "num_atoms": len(list(structure.get_atoms())),
                "pdb_content": response.text,
                "pdb_file_path": pdb_filepath
            }

            return {
                "success": True,
                "result": structure_info,
                "error": None
            }
        else:
            return {
                "success": False,
                "result": None,
                "error": f"API returned error: {response.status_code}"
            }

    # NOTE(review): if `import requests` itself fails above, evaluating this
    # except clause raises NameError — consider catching ImportError first.
    except requests.exceptions.Timeout:  # type: ignore[name-defined]
        return {
            "success": False,
            "result": None,
            "error": "ESMFold API request timed out"
        }
    except Exception as e:
        return {
            "success": False,
            "result": None,
            "error": f"Error predicting structure: {str(e)}"
        }
191
+
192
@mcp.tool(name="analyze_protein_sequence", description="Analyze protein sequence features")
def analyze_protein_sequence(sequence: str):
    """
    Analyze basic features of a protein sequence.

    Parameters:
        sequence (str): Protein sequence.

    Returns:
        dict: success/result/error fields; result holds length, number of
        distinct residues, per-residue counts, and the input sequence.
    """
    try:
        # Amino acid composition in a single O(n) pass instead of one
        # str.count() scan per distinct residue (O(n * k)).
        composition = {}
        for aa in sequence:
            composition[aa] = composition.get(aa, 0) + 1

        return {
            "success": True,
            "result": {
                "length": len(sequence),
                "unique_amino_acids": len(composition),
                "composition": composition,
                "sequence": sequence
            },
            "error": None
        }
    except Exception as e:
        return {
            "success": False,
            "result": None,
            "error": str(e)
        }
220
+
221
@mcp.tool(name="validate_protein_sequence", description="Validate protein sequence format")
def validate_protein_sequence(sequence: str):
    """Validate that a protein sequence contains valid amino acid codes"""
    try:
        # Uppercase first, then flag anything outside the 20 canonical codes.
        normalized = sequence.upper()
        offending = set(normalized) - set("ACDEFGHIKLMNPQRSTVWY")

        payload = {
            "is_valid": not offending,
            "invalid_characters": list(offending) if offending else [],
            "length": len(sequence),
            "uppercase_sequence": normalized,
        }
        return {"success": True, "result": payload, "error": None}
    except Exception as exc:
        return {"success": False, "result": None, "error": str(exc)}
247
+
248
+
249
def create_app():
    """
    Create and return a FastMCP instance.

    Returns:
        FastMCP: MCP service instance.
    """
    # All @mcp.tool functions above are registered on this module-level
    # instance, so returning it exposes the full tool set.
    return mcp
esm/mcp_output/predictions/prediction_20250823_235651.pdb ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ HEADER 18-OCT-22
2
+ TITLE ESMFOLD V1 PREDICTION FOR INPUT
3
+ REMARK 1
4
+ REMARK 1 REFERENCE 1
5
+ REMARK 1 AUTH ZEMING LIN, HALIL AKIN, ROSHAN RAO, BRIAN HIE, ZHONGKAI ZHU,
6
+ REMARK 1 AUTH 2 WENTING LU, NIKITA SMETANIN, ROBERT VERKUIL, ORI KABELI,
7
+ REMARK 1 AUTH 3 YANIV SHMUELI, ALLAN DOS SANTOS COSTA,
8
+ REMARK 1 AUTH 4 MARYAM FAZEL-ZARANDI, TOM SERCU, SALVATORE CANDIDO,
9
+ REMARK 1 AUTH 5 ALEXANDER RIVES
10
+ REMARK 1 TITL EVOLUTIONARY-SCALE PREDICTION OF ATOMIC LEVEL PROTEIN
11
+ REMARK 1 TITL 2 STRUCTURE WITH A LANGUAGE MODEL
12
+ REMARK 1 REF
13
+ REMARK 1 REFN
14
+ REMARK 1 PMID
15
+ REMARK 1 DOI 10.1101/2022.07.20.500902
16
+ REMARK 1
17
+ REMARK 1 LICENSE AND DISCLAIMERS
18
+ REMARK 1 ESM METAGENOMIC ATLAS DATA IS AVAILABLE UNDER
19
+ REMARK 1 A CC-BY-4.0 LICENSE FOR ACADEMIC AND COMMERCIAL USE.
20
+ REMARK 1 COPYRIGHT (C) META PLATFORMS, INC. ALL RIGHTS RESERVED.
21
+ REMARK 1 USE OF THE ESM METAGENOMIC ATLAS DATA IS SUBJECT
22
+ REMARK 1 TO THE META OPEN SOURCE TERMS OF USE AND PRIVACY POLICY.
23
+ ATOM 1 N MET A 1 3.833 -6.152 -16.813 1.00 0.56 N
24
+ ATOM 2 CA MET A 1 3.566 -6.555 -15.436 1.00 0.60 C
25
+ ATOM 3 C MET A 1 4.430 -5.763 -14.460 1.00 0.59 C
26
+ ATOM 4 CB MET A 1 3.813 -8.054 -15.256 1.00 0.51 C
27
+ ATOM 5 O MET A 1 3.939 -5.283 -13.437 1.00 0.57 O
28
+ ATOM 6 CG MET A 1 2.731 -8.762 -14.456 1.00 0.47 C
29
+ ATOM 7 SD MET A 1 2.917 -10.587 -14.484 1.00 0.54 S
30
+ ATOM 8 CE MET A 1 4.224 -10.795 -13.242 1.00 0.45 C
31
+ ATOM 9 N LYS A 2 5.782 -5.722 -14.739 1.00 0.75 N
32
+ ATOM 10 CA LYS A 2 6.694 -4.973 -13.880 1.00 0.77 C
33
+ ATOM 11 C LYS A 2 6.314 -3.495 -13.833 1.00 0.78 C
34
+ ATOM 12 CB LYS A 2 8.137 -5.128 -14.363 1.00 0.69 C
35
+ ATOM 13 O LYS A 2 6.399 -2.860 -12.780 1.00 0.75 O
36
+ ATOM 14 CG LYS A 2 8.788 -6.441 -13.954 1.00 0.60 C
37
+ ATOM 15 CD LYS A 2 10.260 -6.480 -14.343 1.00 0.59 C
38
+ ATOM 16 CE LYS A 2 10.894 -7.822 -14.003 1.00 0.55 C
39
+ ATOM 17 NZ LYS A 2 12.336 -7.867 -14.391 1.00 0.47 N
40
+ ATOM 18 N THR A 3 5.787 -3.126 -14.975 1.00 0.84 N
41
+ ATOM 19 CA THR A 3 5.441 -1.712 -15.059 1.00 0.85 C
42
+ ATOM 20 C THR A 3 4.228 -1.399 -14.187 1.00 0.87 C
43
+ ATOM 21 CB THR A 3 5.153 -1.292 -16.513 1.00 0.79 C
44
+ ATOM 22 O THR A 3 4.184 -0.360 -13.526 1.00 0.85 O
45
+ ATOM 23 CG2 THR A 3 4.989 0.220 -16.626 1.00 0.59 C
46
+ ATOM 24 OG1 THR A 3 6.241 -1.707 -17.348 1.00 0.56 O
47
+ ATOM 25 N VAL A 4 3.332 -2.302 -14.196 1.00 0.89 N
48
+ ATOM 26 CA VAL A 4 2.111 -2.067 -13.432 1.00 0.90 C
49
+ ATOM 27 C VAL A 4 2.434 -2.024 -11.941 1.00 0.91 C
50
+ ATOM 28 CB VAL A 4 1.047 -3.152 -13.715 1.00 0.87 C
51
+ ATOM 29 O VAL A 4 1.944 -1.153 -11.218 1.00 0.90 O
52
+ ATOM 30 CG1 VAL A 4 -0.154 -2.985 -12.787 1.00 0.77 C
53
+ ATOM 31 CG2 VAL A 4 0.608 -3.099 -15.178 1.00 0.76 C
54
+ ATOM 32 N ARG A 5 3.274 -2.915 -11.450 1.00 0.92 N
55
+ ATOM 33 CA ARG A 5 3.645 -2.914 -10.038 1.00 0.93 C
56
+ ATOM 34 C ARG A 5 4.425 -1.655 -9.677 1.00 0.93 C
57
+ ATOM 35 CB ARG A 5 4.470 -4.157 -9.699 1.00 0.92 C
58
+ ATOM 36 O ARG A 5 4.218 -1.075 -8.609 1.00 0.92 O
59
+ ATOM 37 CG ARG A 5 4.755 -4.321 -8.214 1.00 0.90 C
60
+ ATOM 38 CD ARG A 5 5.547 -5.589 -7.929 1.00 0.89 C
61
+ ATOM 39 NE ARG A 5 5.763 -5.779 -6.497 1.00 0.87 N
62
+ ATOM 40 NH1 ARG A 5 7.737 -6.954 -6.739 1.00 0.81 N
63
+ ATOM 41 NH2 ARG A 5 6.895 -6.538 -4.648 1.00 0.80 N
64
+ ATOM 42 CZ ARG A 5 6.798 -6.423 -5.965 1.00 0.85 C
65
+ ATOM 43 N GLN A 6 5.318 -1.260 -10.546 1.00 0.92 N
66
+ ATOM 44 CA GLN A 6 6.089 -0.047 -10.296 1.00 0.92 C
67
+ ATOM 45 C GLN A 6 5.173 1.165 -10.143 1.00 0.93 C
68
+ ATOM 46 CB GLN A 6 7.094 0.194 -11.424 1.00 0.90 C
69
+ ATOM 47 O GLN A 6 5.386 2.003 -9.264 1.00 0.92 O
70
+ ATOM 48 CG GLN A 6 8.270 -0.772 -11.415 1.00 0.80 C
71
+ ATOM 49 CD GLN A 6 9.166 -0.617 -12.630 1.00 0.75 C
72
+ ATOM 50 NE2 GLN A 6 10.400 -1.096 -12.522 1.00 0.64 N
73
+ ATOM 51 OE1 GLN A 6 8.751 -0.072 -13.658 1.00 0.70 O
74
+ ATOM 52 N GLU A 7 4.209 1.185 -11.055 1.00 0.92 N
75
+ ATOM 53 CA GLU A 7 3.260 2.291 -10.961 1.00 0.92 C
76
+ ATOM 54 C GLU A 7 2.452 2.217 -9.669 1.00 0.93 C
77
+ ATOM 55 CB GLU A 7 2.320 2.297 -12.170 1.00 0.90 C
78
+ ATOM 56 O GLU A 7 2.168 3.244 -9.049 1.00 0.92 O
79
+ ATOM 57 CG GLU A 7 2.993 2.712 -13.470 1.00 0.81 C
80
+ ATOM 58 CD GLU A 7 3.663 4.074 -13.390 1.00 0.76 C
81
+ ATOM 59 OE1 GLU A 7 3.045 5.025 -12.860 1.00 0.71 O
82
+ ATOM 60 OE2 GLU A 7 4.816 4.192 -13.863 1.00 0.68 O
83
+ ATOM 61 N ARG A 8 2.161 1.024 -9.290 1.00 0.94 N
84
+ ATOM 62 CA ARG A 8 1.415 0.847 -8.049 1.00 0.94 C
85
+ ATOM 63 C ARG A 8 2.247 1.276 -6.844 1.00 0.95 C
86
+ ATOM 64 CB ARG A 8 0.974 -0.609 -7.889 1.00 0.94 C
87
+ ATOM 65 O ARG A 8 1.748 1.966 -5.953 1.00 0.94 O
88
+ ATOM 66 CG ARG A 8 0.090 -0.856 -6.676 1.00 0.93 C
89
+ ATOM 67 CD ARG A 8 -0.399 -2.296 -6.618 1.00 0.91 C
90
+ ATOM 68 NE ARG A 8 0.707 -3.234 -6.450 1.00 0.90 N
91
+ ATOM 69 NH1 ARG A 8 0.132 -4.557 -8.255 1.00 0.83 N
92
+ ATOM 70 NH2 ARG A 8 1.970 -5.075 -6.987 1.00 0.82 N
93
+ ATOM 71 CZ ARG A 8 0.934 -4.287 -7.231 1.00 0.88 C
94
+ ATOM 72 N LEU A 9 3.502 0.910 -6.829 1.00 0.94 N
95
+ ATOM 73 CA LEU A 9 4.402 1.277 -5.741 1.00 0.95 C
96
+ ATOM 74 C LEU A 9 4.528 2.793 -5.628 1.00 0.94 C
97
+ ATOM 75 CB LEU A 9 5.784 0.654 -5.956 1.00 0.94 C
98
+ ATOM 76 O LEU A 9 4.464 3.345 -4.528 1.00 0.94 O
99
+ ATOM 77 CG LEU A 9 5.867 -0.869 -5.836 1.00 0.93 C
100
+ ATOM 78 CD1 LEU A 9 7.227 -1.365 -6.316 1.00 0.90 C
101
+ ATOM 79 CD2 LEU A 9 5.608 -1.308 -4.399 1.00 0.90 C
102
+ ATOM 80 N LYS A 10 4.686 3.490 -6.747 1.00 0.94 N
103
+ ATOM 81 CA LYS A 10 4.773 4.947 -6.761 1.00 0.94 C
104
+ ATOM 82 C LYS A 10 3.489 5.580 -6.231 1.00 0.94 C
105
+ ATOM 83 CB LYS A 10 5.061 5.454 -8.175 1.00 0.93 C
106
+ ATOM 84 O LYS A 10 3.534 6.594 -5.531 1.00 0.94 O
107
+ ATOM 85 CG LYS A 10 6.475 5.166 -8.660 1.00 0.86 C
108
+ ATOM 86 CD LYS A 10 6.688 5.659 -10.085 1.00 0.81 C
109
+ ATOM 87 CE LYS A 10 8.032 5.206 -10.639 1.00 0.73 C
110
+ ATOM 88 NZ LYS A 10 8.191 5.574 -12.077 1.00 0.64 N
111
+ ATOM 89 N SER A 11 2.412 4.973 -6.576 1.00 0.95 N
112
+ ATOM 90 CA SER A 11 1.124 5.485 -6.118 1.00 0.95 C
113
+ ATOM 91 C SER A 11 0.985 5.357 -4.605 1.00 0.95 C
114
+ ATOM 92 CB SER A 11 -0.022 4.745 -6.808 1.00 0.94 C
115
+ ATOM 93 O SER A 11 0.476 6.266 -3.945 1.00 0.95 O
116
+ ATOM 94 OG SER A 11 -0.073 5.069 -8.187 1.00 0.85 O
117
+ ATOM 95 N ILE A 12 1.404 4.270 -4.118 1.00 0.95 N
118
+ ATOM 96 CA ILE A 12 1.342 4.069 -2.674 1.00 0.96 C
119
+ ATOM 97 C ILE A 12 2.158 5.149 -1.968 1.00 0.95 C
120
+ ATOM 98 CB ILE A 12 1.851 2.666 -2.276 1.00 0.95 C
121
+ ATOM 99 O ILE A 12 1.680 5.776 -1.019 1.00 0.95 O
122
+ ATOM 100 CG1 ILE A 12 0.873 1.587 -2.754 1.00 0.94 C
123
+ ATOM 101 CG2 ILE A 12 2.067 2.579 -0.762 1.00 0.94 C
124
+ ATOM 102 CD1 ILE A 12 1.390 0.165 -2.589 1.00 0.93 C
125
+ ATOM 103 N VAL A 13 3.373 5.365 -2.383 1.00 0.95 N
126
+ ATOM 104 CA VAL A 13 4.255 6.350 -1.765 1.00 0.95 C
127
+ ATOM 105 C VAL A 13 3.625 7.738 -1.859 1.00 0.95 C
128
+ ATOM 106 CB VAL A 13 5.653 6.353 -2.424 1.00 0.94 C
129
+ ATOM 107 O VAL A 13 3.621 8.492 -0.883 1.00 0.94 O
130
+ ATOM 108 CG1 VAL A 13 6.485 7.530 -1.919 1.00 0.92 C
131
+ ATOM 109 CG2 VAL A 13 6.371 5.032 -2.155 1.00 0.92 C
132
+ ATOM 110 N ARG A 14 3.008 8.094 -3.002 1.00 0.94 N
133
+ ATOM 111 CA ARG A 14 2.369 9.390 -3.204 1.00 0.94 C
134
+ ATOM 112 C ARG A 14 1.185 9.570 -2.261 1.00 0.94 C
135
+ ATOM 113 CB ARG A 14 1.911 9.543 -4.656 1.00 0.93 C
136
+ ATOM 114 O ARG A 14 1.009 10.640 -1.674 1.00 0.94 O
137
+ ATOM 115 CG ARG A 14 3.035 9.869 -5.626 1.00 0.83 C
138
+ ATOM 116 CD ARG A 14 2.503 10.221 -7.009 1.00 0.77 C
139
+ ATOM 117 NE ARG A 14 2.183 9.025 -7.783 1.00 0.72 N
140
+ ATOM 118 NH1 ARG A 14 1.327 10.176 -9.594 1.00 0.52 N
141
+ ATOM 119 NH2 ARG A 14 1.383 7.884 -9.609 1.00 0.47 N
142
+ ATOM 120 CZ ARG A 14 1.632 9.031 -8.994 1.00 0.68 C
143
+ ATOM 121 N ILE A 15 0.432 8.498 -2.193 1.00 0.95 N
144
+ ATOM 122 CA ILE A 15 -0.742 8.563 -1.329 1.00 0.95 C
145
+ ATOM 123 C ILE A 15 -0.307 8.788 0.117 1.00 0.95 C
146
+ ATOM 124 CB ILE A 15 -1.596 7.280 -1.438 1.00 0.95 C
147
+ ATOM 125 O ILE A 15 -0.849 9.654 0.807 1.00 0.94 O
148
+ ATOM 126 CG1 ILE A 15 -2.264 7.198 -2.816 1.00 0.93 C
149
+ ATOM 127 CG2 ILE A 15 -2.640 7.229 -0.319 1.00 0.93 C
150
+ ATOM 128 CD1 ILE A 15 -2.880 5.841 -3.126 1.00 0.92 C
151
+ ATOM 129 N LEU A 16 0.689 8.051 0.584 1.00 0.95 N
152
+ ATOM 130 CA LEU A 16 1.143 8.158 1.966 1.00 0.95 C
153
+ ATOM 131 C LEU A 16 1.813 9.505 2.215 1.00 0.95 C
154
+ ATOM 132 CB LEU A 16 2.113 7.022 2.301 1.00 0.95 C
155
+ ATOM 133 O LEU A 16 1.694 10.071 3.304 1.00 0.94 O
156
+ ATOM 134 CG LEU A 16 1.521 5.612 2.331 1.00 0.95 C
157
+ ATOM 135 CD1 LEU A 16 2.609 4.587 2.634 1.00 0.94 C
158
+ ATOM 136 CD2 LEU A 16 0.397 5.525 3.357 1.00 0.93 C
159
+ ATOM 137 N GLU A 17 2.509 10.095 1.227 1.00 0.94 N
160
+ ATOM 138 CA GLU A 17 3.177 11.387 1.356 1.00 0.94 C
161
+ ATOM 139 C GLU A 17 2.165 12.522 1.484 1.00 0.93 C
162
+ ATOM 140 CB GLU A 17 4.100 11.637 0.161 1.00 0.92 C
163
+ ATOM 141 O GLU A 17 2.413 13.505 2.184 1.00 0.92 O
164
+ ATOM 142 CG GLU A 17 5.412 10.868 0.225 1.00 0.86 C
165
+ ATOM 143 CD GLU A 17 6.272 11.044 -1.016 1.00 0.81 C
166
+ ATOM 144 OE1 GLU A 17 5.739 11.459 -2.070 1.00 0.78 O
167
+ ATOM 145 OE2 GLU A 17 7.489 10.764 -0.934 1.00 0.76 O
168
+ ATOM 146 N ARG A 18 1.030 12.289 0.890 1.00 0.93 N
169
+ ATOM 147 CA ARG A 18 0.060 13.378 0.835 1.00 0.93 C
170
+ ATOM 148 C ARG A 18 -0.916 13.301 2.003 1.00 0.92 C
171
+ ATOM 149 CB ARG A 18 -0.705 13.351 -0.490 1.00 0.91 C
172
+ ATOM 150 O ARG A 18 -1.588 14.284 2.323 1.00 0.91 O
173
+ ATOM 151 CG ARG A 18 0.139 13.731 -1.697 1.00 0.83 C
174
+ ATOM 152 CD ARG A 18 -0.656 13.640 -2.992 1.00 0.79 C
175
+ ATOM 153 NE ARG A 18 0.139 14.059 -4.143 1.00 0.73 N
176
+ ATOM 154 NH1 ARG A 18 -1.546 13.730 -5.688 1.00 0.56 N
177
+ ATOM 155 NH2 ARG A 18 0.507 14.490 -6.368 1.00 0.51 N
178
+ ATOM 156 CZ ARG A 18 -0.302 14.092 -5.397 1.00 0.71 C
179
+ ATOM 157 N SER A 19 -0.853 12.101 2.564 1.00 0.92 N
180
+ ATOM 158 CA SER A 19 -1.857 11.925 3.607 1.00 0.92 C
181
+ ATOM 159 C SER A 19 -1.372 12.481 4.942 1.00 0.91 C
182
+ ATOM 160 CB SER A 19 -2.215 10.446 3.762 1.00 0.91 C
183
+ ATOM 161 O SER A 19 -0.232 12.239 5.344 1.00 0.90 O
184
+ ATOM 162 OG SER A 19 -3.219 10.272 4.748 1.00 0.85 O
185
+ ATOM 163 N LYS A 20 -2.211 13.277 5.567 1.00 0.89 N
186
+ ATOM 164 CA LYS A 20 -1.915 13.785 6.903 1.00 0.89 C
187
+ ATOM 165 C LYS A 20 -2.272 12.759 7.974 1.00 0.89 C
188
+ ATOM 166 CB LYS A 20 -2.667 15.092 7.158 1.00 0.87 C
189
+ ATOM 167 O LYS A 20 -1.768 12.824 9.097 1.00 0.87 O
190
+ ATOM 168 CG LYS A 20 -2.206 16.252 6.287 1.00 0.79 C
191
+ ATOM 169 CD LYS A 20 -2.955 17.535 6.623 1.00 0.76 C
192
+ ATOM 170 CE LYS A 20 -2.516 18.689 5.732 1.00 0.68 C
193
+ ATOM 171 NZ LYS A 20 -3.289 19.935 6.018 1.00 0.61 N
194
+ ATOM 172 N GLU A 21 -3.098 11.887 7.683 1.00 0.92 N
195
+ ATOM 173 CA GLU A 21 -3.549 10.825 8.577 1.00 0.92 C
196
+ ATOM 174 C GLU A 21 -3.148 9.451 8.047 1.00 0.92 C
197
+ ATOM 175 CB GLU A 21 -5.066 10.892 8.770 1.00 0.90 C
198
+ ATOM 176 O GLU A 21 -2.881 9.294 6.854 1.00 0.92 O
199
+ ATOM 177 CG GLU A 21 -5.548 12.188 9.406 1.00 0.84 C
200
+ ATOM 178 CD GLU A 21 -7.025 12.165 9.769 1.00 0.80 C
201
+ ATOM 179 OE1 GLU A 21 -7.741 11.233 9.339 1.00 0.78 O
202
+ ATOM 180 OE2 GLU A 21 -7.468 13.088 10.489 1.00 0.74 O
203
+ ATOM 181 N PRO A 22 -3.069 8.392 8.977 1.00 0.94 N
204
+ ATOM 182 CA PRO A 22 -2.802 7.027 8.516 1.00 0.95 C
205
+ ATOM 183 C PRO A 22 -3.802 6.555 7.462 1.00 0.95 C
206
+ ATOM 184 CB PRO A 22 -2.921 6.193 9.794 1.00 0.94 C
207
+ ATOM 185 O PRO A 22 -4.990 6.878 7.544 1.00 0.94 O
208
+ ATOM 186 CG PRO A 22 -2.711 7.168 10.907 1.00 0.93 C
209
+ ATOM 187 CD PRO A 22 -3.262 8.500 10.485 1.00 0.91 C
210
+ ATOM 188 N VAL A 23 -3.318 5.865 6.476 1.00 0.95 N
211
+ ATOM 189 CA VAL A 23 -4.137 5.301 5.408 1.00 0.95 C
212
+ ATOM 190 C VAL A 23 -4.274 3.793 5.603 1.00 0.95 C
213
+ ATOM 191 CB VAL A 23 -3.543 5.607 4.015 1.00 0.95 C
214
+ ATOM 192 O VAL A 23 -3.273 3.074 5.652 1.00 0.95 O
215
+ ATOM 193 CG1 VAL A 23 -4.477 5.115 2.910 1.00 0.93 C
216
+ ATOM 194 CG2 VAL A 23 -3.276 7.103 3.866 1.00 0.93 C
217
+ ATOM 195 N SER A 24 -5.474 3.345 5.655 1.00 0.96 N
218
+ ATOM 196 CA SER A 24 -5.672 1.925 5.923 1.00 0.96 C
219
+ ATOM 197 C SER A 24 -5.316 1.077 4.706 1.00 0.96 C
220
+ ATOM 198 CB SER A 24 -7.119 1.653 6.338 1.00 0.95 C
221
+ ATOM 199 O SER A 24 -5.386 1.552 3.571 1.00 0.95 O
222
+ ATOM 200 OG SER A 24 -7.981 1.698 5.214 1.00 0.91 O
223
+ ATOM 201 N GLY A 25 -4.862 -0.173 5.000 1.00 0.95 N
224
+ ATOM 202 CA GLY A 25 -4.628 -1.110 3.914 1.00 0.95 C
225
+ ATOM 203 C GLY A 25 -5.840 -1.304 3.021 1.00 0.95 C
226
+ ATOM 204 O GLY A 25 -5.706 -1.426 1.802 1.00 0.95 O
227
+ ATOM 205 N ALA A 26 -7.029 -1.338 3.592 1.00 0.95 N
228
+ ATOM 206 CA ALA A 26 -8.269 -1.507 2.839 1.00 0.95 C
229
+ ATOM 207 C ALA A 26 -8.487 -0.346 1.873 1.00 0.95 C
230
+ ATOM 208 CB ALA A 26 -9.456 -1.633 3.790 1.00 0.94 C
231
+ ATOM 209 O ALA A 26 -8.901 -0.551 0.729 1.00 0.95 O
232
+ ATOM 210 N GLN A 27 -8.247 0.886 2.350 1.00 0.95 N
233
+ ATOM 211 CA GLN A 27 -8.373 2.066 1.501 1.00 0.95 C
234
+ ATOM 212 C GLN A 27 -7.389 2.013 0.335 1.00 0.95 C
235
+ ATOM 213 CB GLN A 27 -8.151 3.341 2.317 1.00 0.94 C
236
+ ATOM 214 O GLN A 27 -7.757 2.295 -0.807 1.00 0.95 O
237
+ ATOM 215 CG GLN A 27 -8.308 4.623 1.511 1.00 0.86 C
238
+ ATOM 216 CD GLN A 27 -8.034 5.870 2.330 1.00 0.81 C
239
+ ATOM 217 NE2 GLN A 27 -7.923 7.010 1.657 1.00 0.73 N
240
+ ATOM 218 OE1 GLN A 27 -7.923 5.809 3.559 1.00 0.78 O
241
+ ATOM 219 N LEU A 28 -6.152 1.593 0.619 1.00 0.96 N
242
+ ATOM 220 CA LEU A 28 -5.160 1.467 -0.444 1.00 0.96 C
243
+ ATOM 221 C LEU A 28 -5.574 0.395 -1.448 1.00 0.96 C
244
+ ATOM 222 CB LEU A 28 -3.787 1.131 0.142 1.00 0.95 C
245
+ ATOM 223 O LEU A 28 -5.474 0.602 -2.659 1.00 0.95 O
246
+ ATOM 224 CG LEU A 28 -3.083 2.253 0.906 1.00 0.95 C
247
+ ATOM 225 CD1 LEU A 28 -1.839 1.718 1.608 1.00 0.93 C
248
+ ATOM 226 CD2 LEU A 28 -2.721 3.397 -0.035 1.00 0.92 C
249
+ ATOM 227 N ALA A 29 -6.050 -0.750 -0.922 1.00 0.96 N
250
+ ATOM 228 CA ALA A 29 -6.463 -1.859 -1.778 1.00 0.96 C
251
+ ATOM 229 C ALA A 29 -7.599 -1.441 -2.708 1.00 0.96 C
252
+ ATOM 230 CB ALA A 29 -6.886 -3.056 -0.931 1.00 0.95 C
253
+ ATOM 231 O ALA A 29 -7.574 -1.741 -3.903 1.00 0.95 O
254
+ ATOM 232 N GLU A 30 -8.555 -0.688 -2.175 1.00 0.96 N
255
+ ATOM 233 CA GLU A 30 -9.687 -0.188 -2.949 1.00 0.95 C
256
+ ATOM 234 C GLU A 30 -9.235 0.828 -3.995 1.00 0.95 C
257
+ ATOM 235 CB GLU A 30 -10.735 0.440 -2.026 1.00 0.94 C
258
+ ATOM 236 O GLU A 30 -9.618 0.736 -5.163 1.00 0.94 O
259
+ ATOM 237 CG GLU A 30 -12.004 0.876 -2.744 1.00 0.85 C
260
+ ATOM 238 CD GLU A 30 -13.067 1.424 -1.804 1.00 0.79 C
261
+ ATOM 239 OE1 GLU A 30 -12.798 1.542 -0.587 1.00 0.77 O
262
+ ATOM 240 OE2 GLU A 30 -14.177 1.736 -2.288 1.00 0.74 O
263
+ ATOM 241 N GLU A 31 -8.407 1.749 -3.572 1.00 0.95 N
264
+ ATOM 242 CA GLU A 31 -7.963 2.822 -4.456 1.00 0.94 C
265
+ ATOM 243 C GLU A 31 -7.129 2.277 -5.612 1.00 0.94 C
266
+ ATOM 244 CB GLU A 31 -7.159 3.864 -3.674 1.00 0.92 C
267
+ ATOM 245 O GLU A 31 -7.197 2.791 -6.730 1.00 0.93 O
268
+ ATOM 246 CG GLU A 31 -6.777 5.089 -4.493 1.00 0.78 C
269
+ ATOM 247 CD GLU A 31 -6.323 6.263 -3.642 1.00 0.72 C
270
+ ATOM 248 OE1 GLU A 31 -6.684 6.319 -2.444 1.00 0.68 O
271
+ ATOM 249 OE2 GLU A 31 -5.603 7.135 -4.176 1.00 0.66 O
272
+ ATOM 250 N LEU A 32 -6.421 1.249 -5.368 1.00 0.94 N
273
+ ATOM 251 CA LEU A 32 -5.480 0.768 -6.373 1.00 0.94 C
274
+ ATOM 252 C LEU A 32 -6.005 -0.494 -7.050 1.00 0.94 C
275
+ ATOM 253 CB LEU A 32 -4.114 0.489 -5.738 1.00 0.94 C
276
+ ATOM 254 O LEU A 32 -5.323 -1.081 -7.893 1.00 0.92 O
277
+ ATOM 255 CG LEU A 32 -3.386 1.694 -5.140 1.00 0.92 C
278
+ ATOM 256 CD1 LEU A 32 -2.156 1.238 -4.363 1.00 0.89 C
279
+ ATOM 257 CD2 LEU A 32 -2.998 2.682 -6.234 1.00 0.89 C
280
+ ATOM 258 N SER A 33 -7.187 -0.923 -6.692 1.00 0.95 N
281
+ ATOM 259 CA SER A 33 -7.876 -2.070 -7.275 1.00 0.94 C
282
+ ATOM 260 C SER A 33 -7.028 -3.333 -7.174 1.00 0.94 C
283
+ ATOM 261 CB SER A 33 -8.228 -1.798 -8.738 1.00 0.93 C
284
+ ATOM 262 O SER A 33 -6.863 -4.057 -8.159 1.00 0.93 O
285
+ ATOM 263 OG SER A 33 -9.149 -0.726 -8.842 1.00 0.85 O
286
+ ATOM 264 N VAL A 34 -6.509 -3.606 -5.951 1.00 0.94 N
287
+ ATOM 265 CA VAL A 34 -5.794 -4.840 -5.646 1.00 0.94 C
288
+ ATOM 266 C VAL A 34 -6.250 -5.380 -4.292 1.00 0.94 C
289
+ ATOM 267 CB VAL A 34 -4.264 -4.625 -5.645 1.00 0.94 C
290
+ ATOM 268 O VAL A 34 -7.013 -4.724 -3.580 1.00 0.94 O
291
+ ATOM 269 CG1 VAL A 34 -3.771 -4.252 -7.042 1.00 0.91 C
292
+ ATOM 270 CG2 VAL A 34 -3.878 -3.547 -4.634 1.00 0.91 C
293
+ ATOM 271 N SER A 35 -5.841 -6.605 -4.013 1.00 0.95 N
294
+ ATOM 272 CA SER A 35 -6.185 -7.200 -2.725 1.00 0.95 C
295
+ ATOM 273 C SER A 35 -5.372 -6.579 -1.594 1.00 0.95 C
296
+ ATOM 274 CB SER A 35 -5.960 -8.712 -2.756 1.00 0.94 C
297
+ ATOM 275 O SER A 35 -4.314 -5.993 -1.833 1.00 0.95 O
298
+ ATOM 276 OG SER A 35 -4.575 -9.014 -2.769 1.00 0.87 O
299
+ ATOM 277 N ARG A 36 -5.839 -6.717 -0.444 1.00 0.96 N
300
+ ATOM 278 CA ARG A 36 -5.113 -6.257 0.736 1.00 0.96 C
301
+ ATOM 279 C ARG A 36 -3.765 -6.959 0.857 1.00 0.96 C
302
+ ATOM 280 CB ARG A 36 -5.941 -6.489 2.001 1.00 0.93 C
303
+ ATOM 281 O ARG A 36 -2.785 -6.360 1.306 1.00 0.95 O
304
+ ATOM 282 CG ARG A 36 -7.147 -5.571 2.126 1.00 0.76 C
305
+ ATOM 283 CD ARG A 36 -7.872 -5.769 3.449 1.00 0.72 C
306
+ ATOM 284 NE ARG A 36 -7.044 -5.369 4.583 1.00 0.69 N
307
+ ATOM 285 NH1 ARG A 36 -8.302 -6.492 6.163 1.00 0.58 N
308
+ ATOM 286 NH2 ARG A 36 -6.450 -5.302 6.800 1.00 0.52 N
309
+ ATOM 287 CZ ARG A 36 -7.267 -5.722 5.846 1.00 0.64 C
310
+ ATOM 288 N GLN A 37 -3.757 -8.247 0.479 1.00 0.96 N
311
+ ATOM 289 CA GLN A 37 -2.512 -9.005 0.531 1.00 0.96 C
312
+ ATOM 290 C GLN A 37 -1.446 -8.374 -0.361 1.00 0.96 C
313
+ ATOM 291 CB GLN A 37 -2.749 -10.458 0.117 1.00 0.94 C
314
+ ATOM 292 O GLN A 37 -0.273 -8.312 0.013 1.00 0.96 O
315
+ ATOM 293 CG GLN A 37 -1.503 -11.329 0.189 1.00 0.79 C
316
+ ATOM 294 CD GLN A 37 -1.003 -11.522 1.609 1.00 0.72 C
317
+ ATOM 295 NE2 GLN A 37 0.254 -11.931 1.744 1.00 0.59 N
318
+ ATOM 296 OE1 GLN A 37 -1.740 -11.305 2.576 1.00 0.68 O
319
+ ATOM 297 N VAL A 38 -1.860 -7.916 -1.492 1.00 0.96 N
320
+ ATOM 298 CA VAL A 38 -0.934 -7.261 -2.410 1.00 0.96 C
321
+ ATOM 299 C VAL A 38 -0.384 -5.989 -1.769 1.00 0.96 C
322
+ ATOM 300 CB VAL A 38 -1.612 -6.928 -3.758 1.00 0.95 C
323
+ ATOM 301 O VAL A 38 0.810 -5.699 -1.875 1.00 0.96 O
324
+ ATOM 302 CG1 VAL A 38 -0.732 -5.996 -4.589 1.00 0.93 C
325
+ ATOM 303 CG2 VAL A 38 -1.919 -8.209 -4.531 1.00 0.92 C
326
+ ATOM 304 N ILE A 39 -1.219 -5.306 -1.044 1.00 0.96 N
327
+ ATOM 305 CA ILE A 39 -0.801 -4.074 -0.385 1.00 0.96 C
328
+ ATOM 306 C ILE A 39 0.250 -4.387 0.677 1.00 0.96 C
329
+ ATOM 307 CB ILE A 39 -2.002 -3.337 0.250 1.00 0.96 C
330
+ ATOM 308 O ILE A 39 1.263 -3.692 0.781 1.00 0.96 O
331
+ ATOM 309 CG1 ILE A 39 -2.970 -2.857 -0.838 1.00 0.94 C
332
+ ATOM 310 CG2 ILE A 39 -1.522 -2.166 1.112 1.00 0.94 C
333
+ ATOM 311 CD1 ILE A 39 -2.336 -1.930 -1.865 1.00 0.91 C
334
+ ATOM 312 N VAL A 40 0.033 -5.399 1.497 1.00 0.96 N
335
+ ATOM 313 CA VAL A 40 0.963 -5.793 2.550 1.00 0.96 C
336
+ ATOM 314 C VAL A 40 2.317 -6.146 1.940 1.00 0.97 C
337
+ ATOM 315 CB VAL A 40 0.420 -6.987 3.368 1.00 0.96 C
338
+ ATOM 316 O VAL A 40 3.362 -5.740 2.454 1.00 0.96 O
339
+ ATOM 317 CG1 VAL A 40 1.503 -7.549 4.287 1.00 0.89 C
340
+ ATOM 318 CG2 VAL A 40 -0.806 -6.565 4.177 1.00 0.89 C
341
+ ATOM 319 N GLN A 41 2.280 -6.826 0.823 1.00 0.96 N
342
+ ATOM 320 CA GLN A 41 3.510 -7.216 0.142 1.00 0.96 C
343
+ ATOM 321 C GLN A 41 4.236 -5.998 -0.424 1.00 0.96 C
344
+ ATOM 322 CB GLN A 41 3.211 -8.215 -0.977 1.00 0.95 C
345
+ ATOM 323 O GLN A 41 5.463 -5.909 -0.348 1.00 0.95 O
346
+ ATOM 324 CG GLN A 41 2.779 -9.586 -0.477 1.00 0.86 C
347
+ ATOM 325 CD GLN A 41 2.333 -10.506 -1.598 1.00 0.79 C
348
+ ATOM 326 NE2 GLN A 41 2.103 -11.773 -1.270 1.00 0.69 N
349
+ ATOM 327 OE1 GLN A 41 2.196 -10.082 -2.750 1.00 0.77 O
350
+ ATOM 328 N ASP A 42 3.536 -5.124 -0.987 1.00 0.96 N
351
+ ATOM 329 CA ASP A 42 4.119 -3.918 -1.568 1.00 0.96 C
352
+ ATOM 330 C ASP A 42 4.728 -3.027 -0.487 1.00 0.96 C
353
+ ATOM 331 CB ASP A 42 3.066 -3.140 -2.359 1.00 0.95 C
354
+ ATOM 332 O ASP A 42 5.806 -2.461 -0.677 1.00 0.95 O
355
+ ATOM 333 CG ASP A 42 2.757 -3.764 -3.709 1.00 0.94 C
356
+ ATOM 334 OD1 ASP A 42 3.553 -4.599 -4.191 1.00 0.91 O
357
+ ATOM 335 OD2 ASP A 42 1.711 -3.416 -4.297 1.00 0.92 O
358
+ ATOM 336 N ILE A 43 4.006 -2.921 0.639 1.00 0.96 N
359
+ ATOM 337 CA ILE A 43 4.512 -2.107 1.738 1.00 0.96 C
360
+ ATOM 338 C ILE A 43 5.806 -2.713 2.276 1.00 0.96 C
361
+ ATOM 339 CB ILE A 43 3.470 -1.976 2.871 1.00 0.96 C
362
+ ATOM 340 O ILE A 43 6.770 -1.994 2.548 1.00 0.96 O
363
+ ATOM 341 CG1 ILE A 43 2.293 -1.106 2.416 1.00 0.94 C
364
+ ATOM 342 CG2 ILE A 43 4.117 -1.404 4.136 1.00 0.93 C
365
+ ATOM 343 CD1 ILE A 43 2.657 0.352 2.174 1.00 0.91 C
366
+ ATOM 344 N ALA A 44 5.851 -4.052 2.411 1.00 0.96 N
367
+ ATOM 345 CA ALA A 44 7.075 -4.731 2.830 1.00 0.96 C
368
+ ATOM 346 C ALA A 44 8.218 -4.451 1.859 1.00 0.96 C
369
+ ATOM 347 CB ALA A 44 6.837 -6.234 2.947 1.00 0.96 C
370
+ ATOM 348 O ALA A 44 9.353 -4.213 2.278 1.00 0.96 O
371
+ ATOM 349 N TYR A 45 7.872 -4.425 0.639 1.00 0.95 N
372
+ ATOM 350 CA TYR A 45 8.877 -4.165 -0.387 1.00 0.95 C
373
+ ATOM 351 C TYR A 45 9.363 -2.723 -0.320 1.00 0.95 C
374
+ ATOM 352 CB TYR A 45 8.312 -4.461 -1.780 1.00 0.95 C
375
+ ATOM 353 O TYR A 45 10.568 -2.464 -0.380 1.00 0.95 O
376
+ ATOM 354 CG TYR A 45 9.316 -4.279 -2.893 1.00 0.93 C
377
+ ATOM 355 CD1 TYR A 45 10.439 -5.098 -2.985 1.00 0.89 C
378
+ ATOM 356 CD2 TYR A 45 9.143 -3.288 -3.853 1.00 0.88 C
379
+ ATOM 357 CE1 TYR A 45 11.367 -4.933 -4.008 1.00 0.88 C
380
+ ATOM 358 CE2 TYR A 45 10.065 -3.113 -4.880 1.00 0.88 C
381
+ ATOM 359 OH TYR A 45 12.088 -3.771 -5.963 1.00 0.79 O
382
+ ATOM 360 CZ TYR A 45 11.172 -3.940 -4.949 1.00 0.86 C
383
+ ATOM 361 N LEU A 46 8.521 -1.815 -0.237 1.00 0.95 N
384
+ ATOM 362 CA LEU A 46 8.894 -0.409 -0.123 1.00 0.95 C
385
+ ATOM 363 C LEU A 46 9.792 -0.181 1.088 1.00 0.95 C
386
+ ATOM 364 CB LEU A 46 7.645 0.470 -0.019 1.00 0.95 C
387
+ ATOM 365 O LEU A 46 10.747 0.595 1.021 1.00 0.95 O
388
+ ATOM 366 CG LEU A 46 6.847 0.669 -1.309 1.00 0.95 C
389
+ ATOM 367 CD1 LEU A 46 5.518 1.356 -1.010 1.00 0.93 C
390
+ ATOM 368 CD2 LEU A 46 7.656 1.476 -2.319 1.00 0.93 C
391
+ ATOM 369 N ARG A 47 9.531 -0.912 2.192 1.00 0.96 N
392
+ ATOM 370 CA ARG A 47 10.387 -0.815 3.370 1.00 0.96 C
393
+ ATOM 371 C ARG A 47 11.791 -1.332 3.072 1.00 0.96 C
394
+ ATOM 372 CB ARG A 47 9.781 -1.591 4.541 1.00 0.95 C
395
+ ATOM 373 O ARG A 47 12.781 -0.740 3.506 1.00 0.95 O
396
+ ATOM 374 CG ARG A 47 8.562 -0.923 5.158 1.00 0.93 C
397
+ ATOM 375 CD ARG A 47 8.006 -1.730 6.323 1.00 0.91 C
398
+ ATOM 376 NE ARG A 47 6.966 -0.996 7.039 1.00 0.90 N
399
+ ATOM 377 NH1 ARG A 47 5.819 -2.880 7.724 1.00 0.83 N
400
+ ATOM 378 NH2 ARG A 47 5.061 -0.797 8.306 1.00 0.81 N
401
+ ATOM 379 CZ ARG A 47 5.951 -1.559 7.688 1.00 0.87 C
402
+ ATOM 380 N SER A 48 11.822 -2.364 2.346 1.00 0.96 N
403
+ ATOM 381 CA SER A 48 13.124 -2.927 2.003 1.00 0.96 C
404
+ ATOM 382 C SER A 48 13.929 -1.969 1.131 1.00 0.95 C
405
+ ATOM 383 CB SER A 48 12.957 -4.266 1.283 1.00 0.95 C
406
+ ATOM 384 O SER A 48 15.159 -2.041 1.091 1.00 0.94 O
407
+ ATOM 385 OG SER A 48 12.598 -4.066 -0.073 1.00 0.90 O
408
+ ATOM 386 N LEU A 49 13.228 -1.054 0.452 1.00 0.95 N
409
+ ATOM 387 CA LEU A 49 13.904 -0.075 -0.393 1.00 0.94 C
410
+ ATOM 388 C LEU A 49 14.342 1.137 0.423 1.00 0.94 C
411
+ ATOM 389 CB LEU A 49 12.989 0.368 -1.537 1.00 0.94 C
412
+ ATOM 390 O LEU A 49 14.979 2.049 -0.107 1.00 0.92 O
413
+ ATOM 391 CG LEU A 49 12.691 -0.678 -2.612 1.00 0.89 C
414
+ ATOM 392 CD1 LEU A 49 11.737 -0.109 -3.656 1.00 0.83 C
415
+ ATOM 393 CD2 LEU A 49 13.984 -1.156 -3.265 1.00 0.83 C
416
+ ATOM 394 N GLY A 50 13.868 1.196 1.683 1.00 0.94 N
417
+ ATOM 395 CA GLY A 50 14.344 2.263 2.549 1.00 0.94 C
418
+ ATOM 396 C GLY A 50 13.258 3.253 2.927 1.00 0.94 C
419
+ ATOM 397 O GLY A 50 13.514 4.216 3.652 1.00 0.93 O
420
+ ATOM 398 N TYR A 51 12.070 3.128 2.433 1.00 0.94 N
421
+ ATOM 399 CA TYR A 51 10.982 4.007 2.846 1.00 0.94 C
422
+ ATOM 400 C TYR A 51 10.598 3.753 4.299 1.00 0.94 C
423
+ ATOM 401 CB TYR A 51 9.760 3.812 1.942 1.00 0.94 C
424
+ ATOM 402 O TYR A 51 10.466 2.601 4.721 1.00 0.93 O
425
+ ATOM 403 CG TYR A 51 9.971 4.290 0.526 1.00 0.93 C
426
+ ATOM 404 CD1 TYR A 51 9.767 5.625 0.183 1.00 0.92 C
427
+ ATOM 405 CD2 TYR A 51 10.373 3.408 -0.472 1.00 0.91 C
428
+ ATOM 406 CE1 TYR A 51 9.958 6.069 -1.121 1.00 0.91 C
429
+ ATOM 407 CE2 TYR A 51 10.567 3.842 -1.779 1.00 0.91 C
430
+ ATOM 408 OH TYR A 51 10.548 5.607 -3.386 1.00 0.87 O
431
+ ATOM 409 CZ TYR A 51 10.358 5.172 -2.093 1.00 0.91 C
432
+ ATOM 410 N ASN A 52 10.429 4.810 5.078 1.00 0.95 N
433
+ ATOM 411 CA ASN A 52 10.061 4.716 6.487 1.00 0.95 C
434
+ ATOM 412 C ASN A 52 8.547 4.669 6.670 1.00 0.95 C
435
+ ATOM 413 CB ASN A 52 10.656 5.885 7.276 1.00 0.94 C
436
+ ATOM 414 O ASN A 52 7.960 5.581 7.255 1.00 0.94 O
437
+ ATOM 415 CG ASN A 52 10.661 5.638 8.772 1.00 0.86 C
438
+ ATOM 416 ND2 ASN A 52 10.958 6.677 9.543 1.00 0.77 N
439
+ ATOM 417 OD1 ASN A 52 10.401 4.522 9.230 1.00 0.75 O
440
+ ATOM 418 N ILE A 53 7.934 3.537 6.187 1.00 0.95 N
441
+ ATOM 419 CA ILE A 53 6.498 3.323 6.325 1.00 0.96 C
442
+ ATOM 420 C ILE A 53 6.209 2.590 7.633 1.00 0.95 C
443
+ ATOM 421 CB ILE A 53 5.927 2.530 5.128 1.00 0.95 C
444
+ ATOM 422 O ILE A 53 6.695 1.477 7.849 1.00 0.94 O
445
+ ATOM 423 CG1 ILE A 53 6.259 3.239 3.810 1.00 0.94 C
446
+ ATOM 424 CG2 ILE A 53 4.416 2.335 5.281 1.00 0.94 C
447
+ ATOM 425 CD1 ILE A 53 5.911 2.431 2.568 1.00 0.93 C
448
+ ATOM 426 N VAL A 54 5.441 3.118 8.475 1.00 0.95 N
449
+ ATOM 427 CA VAL A 54 5.115 2.530 9.770 1.00 0.95 C
450
+ ATOM 428 C VAL A 54 3.650 2.103 9.791 1.00 0.95 C
451
+ ATOM 429 CB VAL A 54 5.400 3.514 10.927 1.00 0.94 C
452
+ ATOM 430 O VAL A 54 2.779 2.826 9.299 1.00 0.95 O
453
+ ATOM 431 CG1 VAL A 54 4.993 2.903 12.267 1.00 0.78 C
454
+ ATOM 432 CG2 VAL A 54 6.876 3.906 10.942 1.00 0.78 C
455
+ ATOM 433 N ALA A 55 3.436 0.986 10.239 1.00 0.95 N
456
+ ATOM 434 CA ALA A 55 2.086 0.462 10.428 1.00 0.95 C
457
+ ATOM 435 C ALA A 55 1.527 0.864 11.790 1.00 0.94 C
458
+ ATOM 436 CB ALA A 55 2.080 -1.057 10.281 1.00 0.94 C
459
+ ATOM 437 O ALA A 55 2.203 0.723 12.811 1.00 0.93 O
460
+ ATOM 438 N THR A 56 0.347 1.432 11.827 1.00 0.93 N
461
+ ATOM 439 CA THR A 56 -0.403 1.797 13.024 1.00 0.93 C
462
+ ATOM 440 C THR A 56 -1.730 1.046 13.080 1.00 0.93 C
463
+ ATOM 441 CB THR A 56 -0.665 3.314 13.077 1.00 0.92 C
464
+ ATOM 442 O THR A 56 -2.129 0.405 12.106 1.00 0.92 O
465
+ ATOM 443 CG2 THR A 56 0.585 4.102 12.702 1.00 0.87 C
466
+ ATOM 444 OG1 THR A 56 -1.717 3.643 12.161 1.00 0.88 O
467
+ ATOM 445 N PRO A 57 -2.436 0.978 14.221 1.00 0.92 N
468
+ ATOM 446 CA PRO A 57 -3.759 0.351 14.257 1.00 0.92 C
469
+ ATOM 447 C PRO A 57 -4.713 0.927 13.214 1.00 0.91 C
470
+ ATOM 448 CB PRO A 57 -4.253 0.651 15.675 1.00 0.91 C
471
+ ATOM 449 O PRO A 57 -5.663 0.255 12.803 1.00 0.89 O
472
+ ATOM 450 CG PRO A 57 -3.010 0.760 16.497 1.00 0.89 C
473
+ ATOM 451 CD PRO A 57 -1.939 1.398 15.660 1.00 0.88 C
474
+ ATOM 452 N ARG A 58 -4.402 2.198 12.767 1.00 0.94 N
475
+ ATOM 453 CA ARG A 58 -5.325 2.830 11.831 1.00 0.94 C
476
+ ATOM 454 C ARG A 58 -4.823 2.702 10.396 1.00 0.93 C
477
+ ATOM 455 CB ARG A 58 -5.525 4.305 12.186 1.00 0.92 C
478
+ ATOM 456 O ARG A 58 -5.517 3.089 9.454 1.00 0.92 O
479
+ ATOM 457 CG ARG A 58 -6.256 4.529 13.500 1.00 0.86 C
480
+ ATOM 458 CD ARG A 58 -6.551 6.003 13.738 1.00 0.82 C
481
+ ATOM 459 NE ARG A 58 -5.333 6.759 14.014 1.00 0.77 N
482
+ ATOM 460 NH1 ARG A 58 -6.405 8.786 14.289 1.00 0.64 N
483
+ ATOM 461 NH2 ARG A 58 -4.128 8.650 14.509 1.00 0.59 N
484
+ ATOM 462 CZ ARG A 58 -5.291 8.063 14.270 1.00 0.75 C
485
+ ATOM 463 N GLY A 59 -3.702 2.248 10.206 1.00 0.95 N
486
+ ATOM 464 CA GLY A 59 -3.182 2.097 8.857 1.00 0.95 C
487
+ ATOM 465 C GLY A 59 -1.700 2.406 8.749 1.00 0.95 C
488
+ ATOM 466 O GLY A 59 -0.945 2.190 9.699 1.00 0.95 O
489
+ ATOM 467 N TYR A 60 -1.248 2.930 7.637 1.00 0.96 N
490
+ ATOM 468 CA TYR A 60 0.157 3.167 7.325 1.00 0.96 C
491
+ ATOM 469 C TYR A 60 0.457 4.660 7.265 1.00 0.96 C
492
+ ATOM 470 CB TYR A 60 0.531 2.505 5.996 1.00 0.96 C
493
+ ATOM 471 O TYR A 60 -0.380 5.451 6.824 1.00 0.95 O
494
+ ATOM 472 CG TYR A 60 0.342 1.008 5.988 1.00 0.95 C
495
+ ATOM 473 CD1 TYR A 60 1.278 0.165 6.583 1.00 0.94 C
496
+ ATOM 474 CD2 TYR A 60 -0.772 0.433 5.386 1.00 0.94 C
497
+ ATOM 475 CE1 TYR A 60 1.109 -1.216 6.577 1.00 0.94 C
498
+ ATOM 476 CE2 TYR A 60 -0.952 -0.946 5.374 1.00 0.94 C
499
+ ATOM 477 OH TYR A 60 -0.180 -3.127 5.962 1.00 0.90 O
500
+ ATOM 478 CZ TYR A 60 -0.008 -1.761 5.971 1.00 0.93 C
501
+ ATOM 479 N VAL A 61 1.559 5.020 7.726 1.00 0.95 N
502
+ ATOM 480 CA VAL A 61 2.043 6.393 7.628 1.00 0.95 C
503
+ ATOM 481 C VAL A 61 3.484 6.399 7.124 1.00 0.95 C
504
+ ATOM 482 CB VAL A 61 1.950 7.126 8.985 1.00 0.93 C
505
+ ATOM 483 O VAL A 61 4.263 5.496 7.439 1.00 0.94 O
506
+ ATOM 484 CG1 VAL A 61 0.504 7.172 9.476 1.00 0.75 C
507
+ ATOM 485 CG2 VAL A 61 2.847 6.448 10.019 1.00 0.76 C
508
+ ATOM 486 N LEU A 62 3.751 7.326 6.213 1.00 0.95 N
509
+ ATOM 487 CA LEU A 62 5.136 7.577 5.829 1.00 0.95 C
510
+ ATOM 488 C LEU A 62 5.806 8.537 6.807 1.00 0.94 C
511
+ ATOM 489 CB LEU A 62 5.205 8.146 4.410 1.00 0.94 C
512
+ ATOM 490 O LEU A 62 5.536 9.740 6.785 1.00 0.92 O
513
+ ATOM 491 CG LEU A 62 6.604 8.327 3.819 1.00 0.89 C
514
+ ATOM 492 CD1 LEU A 62 7.315 6.982 3.717 1.00 0.83 C
515
+ ATOM 493 CD2 LEU A 62 6.525 9.001 2.453 1.00 0.82 C
516
+ ATOM 494 N ALA A 63 6.595 7.972 7.715 1.00 0.91 N
517
+ ATOM 495 CA ALA A 63 7.245 8.760 8.759 1.00 0.91 C
518
+ ATOM 496 C ALA A 63 8.462 9.500 8.211 1.00 0.88 C
519
+ ATOM 497 CB ALA A 63 7.653 7.864 9.926 1.00 0.87 C
520
+ ATOM 498 O ALA A 63 9.150 9.000 7.317 1.00 0.84 O
521
+ ATOM 499 N GLY A 64 8.844 10.801 8.768 1.00 0.79 N
522
+ ATOM 500 CA GLY A 64 10.029 11.551 8.385 1.00 0.77 C
523
+ ATOM 501 C GLY A 64 9.839 12.359 7.115 1.00 0.76 C
524
+ ATOM 502 O GLY A 64 10.811 12.831 6.522 1.00 0.70 O
525
+ ATOM 503 N GLY A 65 8.383 12.399 6.587 1.00 0.54 N
526
+ ATOM 504 CA GLY A 65 8.133 13.333 5.501 1.00 0.54 C
527
+ ATOM 505 C GLY A 65 7.712 14.709 5.983 1.00 0.53 C
528
+ ATOM 506 O GLY A 65 7.142 14.845 7.067 1.00 0.51 O
esm/mcp_output/predictions/prediction_20250830_220641.pdb ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ HEADER 18-OCT-22
2
+ TITLE ESMFOLD V1 PREDICTION FOR INPUT
3
+ REMARK 1
4
+ REMARK 1 REFERENCE 1
5
+ REMARK 1 AUTH ZEMING LIN, HALIL AKIN, ROSHAN RAO, BRIAN HIE, ZHONGKAI ZHU,
6
+ REMARK 1 AUTH 2 WENTING LU, NIKITA SMETANIN, ROBERT VERKUIL, ORI KABELI,
7
+ REMARK 1 AUTH 3 YANIV SHMUELI, ALLAN DOS SANTOS COSTA,
8
+ REMARK 1 AUTH 4 MARYAM FAZEL-ZARANDI, TOM SERCU, SALVATORE CANDIDO,
9
+ REMARK 1 AUTH 5 ALEXANDER RIVES
10
+ REMARK 1 TITL EVOLUTIONARY-SCALE PREDICTION OF ATOMIC LEVEL PROTEIN
11
+ REMARK 1 TITL 2 STRUCTURE WITH A LANGUAGE MODEL
12
+ REMARK 1 REF
13
+ REMARK 1 REFN
14
+ REMARK 1 PMID
15
+ REMARK 1 DOI 10.1101/2022.07.20.500902
16
+ REMARK 1
17
+ REMARK 1 LICENSE AND DISCLAIMERS
18
+ REMARK 1 ESM METAGENOMIC ATLAS DATA IS AVAILABLE UNDER
19
+ REMARK 1 A CC-BY-4.0 LICENSE FOR ACADEMIC AND COMMERCIAL USE.
20
+ REMARK 1 COPYRIGHT (C) META PLATFORMS, INC. ALL RIGHTS RESERVED.
21
+ REMARK 1 USE OF THE ESM METAGENOMIC ATLAS DATA IS SUBJECT
22
+ REMARK 1 TO THE META OPEN SOURCE TERMS OF USE AND PRIVACY POLICY.
23
+ ATOM 1 N MET A 1 12.955 22.762 2.808 1.00 0.40 N
24
+ ATOM 2 CA MET A 1 13.442 21.402 3.023 1.00 0.43 C
25
+ ATOM 3 C MET A 1 12.281 20.416 3.108 1.00 0.41 C
26
+ ATOM 4 CB MET A 1 14.285 21.328 4.297 1.00 0.37 C
27
+ ATOM 5 O MET A 1 11.322 20.643 3.847 1.00 0.40 O
28
+ ATOM 6 CG MET A 1 15.524 20.457 4.162 1.00 0.36 C
29
+ ATOM 7 SD MET A 1 16.674 20.646 5.579 1.00 0.46 S
30
+ ATOM 8 CE MET A 1 16.455 19.037 6.387 1.00 0.35 C
31
+ ATOM 9 N LYS A 2 11.743 19.862 2.050 1.00 0.44 N
32
+ ATOM 10 CA LYS A 2 10.680 18.862 2.091 1.00 0.48 C
33
+ ATOM 11 C LYS A 2 10.854 17.924 3.282 1.00 0.45 C
34
+ ATOM 12 CB LYS A 2 10.648 18.058 0.791 1.00 0.41 C
35
+ ATOM 13 O LYS A 2 11.967 17.481 3.573 1.00 0.44 O
36
+ ATOM 14 CG LYS A 2 9.807 18.691 -0.308 1.00 0.40 C
37
+ ATOM 15 CD LYS A 2 9.743 17.804 -1.544 1.00 0.44 C
38
+ ATOM 16 CE LYS A 2 8.931 18.452 -2.657 1.00 0.42 C
39
+ ATOM 17 NZ LYS A 2 8.883 17.595 -3.880 1.00 0.37 N
40
+ ATOM 18 N THR A 3 10.260 18.153 4.498 1.00 0.59 N
41
+ ATOM 19 CA THR A 3 10.394 17.359 5.714 1.00 0.62 C
42
+ ATOM 20 C THR A 3 10.444 15.870 5.386 1.00 0.57 C
43
+ ATOM 21 CB THR A 3 9.235 17.633 6.691 1.00 0.52 C
44
+ ATOM 22 O THR A 3 10.034 15.454 4.300 1.00 0.52 O
45
+ ATOM 23 CG2 THR A 3 9.446 18.945 7.440 1.00 0.42 C
46
+ ATOM 24 OG1 THR A 3 8.007 17.707 5.957 1.00 0.44 O
47
+ ATOM 25 N VAL A 4 11.392 14.978 5.807 1.00 0.55 N
48
+ ATOM 26 CA VAL A 4 11.478 13.527 5.688 1.00 0.55 C
49
+ ATOM 27 C VAL A 4 10.083 12.942 5.476 1.00 0.55 C
50
+ ATOM 28 CB VAL A 4 12.139 12.893 6.932 1.00 0.50 C
51
+ ATOM 29 O VAL A 4 9.905 12.019 4.678 1.00 0.54 O
52
+ ATOM 30 CG1 VAL A 4 12.097 11.368 6.849 1.00 0.40 C
53
+ ATOM 31 CG2 VAL A 4 13.578 13.384 7.080 1.00 0.41 C
54
+ ATOM 32 N ARG A 5 9.045 13.367 6.280 1.00 0.59 N
55
+ ATOM 33 CA ARG A 5 7.679 12.859 6.208 1.00 0.59 C
56
+ ATOM 34 C ARG A 5 7.124 12.979 4.793 1.00 0.58 C
57
+ ATOM 35 CB ARG A 5 6.775 13.607 7.191 1.00 0.55 C
58
+ ATOM 36 O ARG A 5 6.507 12.043 4.280 1.00 0.57 O
59
+ ATOM 37 CG ARG A 5 5.520 12.841 7.578 1.00 0.51 C
60
+ ATOM 38 CD ARG A 5 4.719 13.573 8.645 1.00 0.53 C
61
+ ATOM 39 NE ARG A 5 3.461 12.892 8.938 1.00 0.46 N
62
+ ATOM 40 NH1 ARG A 5 2.718 14.472 10.450 1.00 0.36 N
63
+ ATOM 41 NH2 ARG A 5 1.430 12.635 9.979 1.00 0.32 N
64
+ ATOM 42 CZ ARG A 5 2.539 13.335 9.788 1.00 0.47 C
65
+ ATOM 43 N GLN A 6 7.225 14.159 4.296 1.00 0.54 N
66
+ ATOM 44 CA GLN A 6 6.739 14.443 2.950 1.00 0.53 C
67
+ ATOM 45 C GLN A 6 7.437 13.564 1.916 1.00 0.54 C
68
+ ATOM 46 CB GLN A 6 6.942 15.920 2.607 1.00 0.49 C
69
+ ATOM 47 O GLN A 6 6.804 13.084 0.974 1.00 0.53 O
70
+ ATOM 48 CG GLN A 6 5.903 16.844 3.227 1.00 0.46 C
71
+ ATOM 49 CD GLN A 6 6.171 18.309 2.935 1.00 0.48 C
72
+ ATOM 50 NE2 GLN A 6 5.132 19.132 3.031 1.00 0.42 N
73
+ ATOM 51 OE1 GLN A 6 7.302 18.697 2.626 1.00 0.54 O
74
+ ATOM 52 N GLU A 7 8.751 13.356 2.208 1.00 0.53 N
75
+ ATOM 53 CA GLU A 7 9.522 12.566 1.253 1.00 0.51 C
76
+ ATOM 54 C GLU A 7 9.048 11.115 1.229 1.00 0.52 C
77
+ ATOM 55 CB GLU A 7 11.015 12.627 1.585 1.00 0.49 C
78
+ ATOM 56 O GLU A 7 8.959 10.504 0.162 1.00 0.52 O
79
+ ATOM 57 CG GLU A 7 11.700 13.898 1.106 1.00 0.45 C
80
+ ATOM 58 CD GLU A 7 13.193 13.920 1.394 1.00 0.47 C
81
+ ATOM 59 OE1 GLU A 7 13.697 12.984 2.054 1.00 0.53 O
82
+ ATOM 60 OE2 GLU A 7 13.863 14.882 0.956 1.00 0.48 O
83
+ ATOM 61 N ARG A 8 8.906 10.459 2.342 1.00 0.54 N
84
+ ATOM 62 CA ARG A 8 8.501 9.057 2.381 1.00 0.53 C
85
+ ATOM 63 C ARG A 8 7.127 8.866 1.748 1.00 0.54 C
86
+ ATOM 64 CB ARG A 8 8.491 8.540 3.821 1.00 0.52 C
87
+ ATOM 65 O ARG A 8 6.894 7.883 1.042 1.00 0.53 O
88
+ ATOM 66 CG ARG A 8 9.859 8.124 4.336 1.00 0.50 C
89
+ ATOM 67 CD ARG A 8 9.777 7.520 5.731 1.00 0.51 C
90
+ ATOM 68 NE ARG A 8 11.087 7.083 6.207 1.00 0.45 N
91
+ ATOM 69 NH1 ARG A 8 10.307 6.243 8.213 1.00 0.37 N
92
+ ATOM 70 NH2 ARG A 8 12.540 6.130 7.708 1.00 0.33 N
93
+ ATOM 71 CZ ARG A 8 11.308 6.486 7.375 1.00 0.48 C
94
+ ATOM 72 N LEU A 9 6.231 9.830 2.211 1.00 0.55 N
95
+ ATOM 73 CA LEU A 9 4.899 9.731 1.624 1.00 0.54 C
96
+ ATOM 74 C LEU A 9 4.969 9.810 0.103 1.00 0.55 C
97
+ ATOM 75 CB LEU A 9 3.992 10.840 2.163 1.00 0.52 C
98
+ ATOM 76 O LEU A 9 4.234 9.107 -0.594 1.00 0.55 O
99
+ ATOM 77 CG LEU A 9 3.348 10.585 3.527 1.00 0.50 C
100
+ ATOM 78 CD1 LEU A 9 2.845 11.893 4.128 1.00 0.45 C
101
+ ATOM 79 CD2 LEU A 9 2.211 9.577 3.402 1.00 0.47 C
102
+ ATOM 80 N LEU A 10 5.770 10.725 -0.331 1.00 0.49 N
103
+ ATOM 81 CA LEU A 10 5.935 10.895 -1.770 1.00 0.48 C
104
+ ATOM 82 C LEU A 10 6.431 9.605 -2.417 1.00 0.49 C
105
+ ATOM 83 CB LEU A 10 6.911 12.036 -2.067 1.00 0.46 C
106
+ ATOM 84 O LEU A 10 5.980 9.237 -3.503 1.00 0.49 O
107
+ ATOM 85 CG LEU A 10 6.296 13.336 -2.587 1.00 0.43 C
108
+ ATOM 86 CD1 LEU A 10 6.864 14.532 -1.829 1.00 0.38 C
109
+ ATOM 87 CD2 LEU A 10 6.540 13.481 -4.085 1.00 0.41 C
110
+ ATOM 88 N LYS A 11 7.442 8.965 -1.722 1.00 0.51 N
111
+ ATOM 89 CA LYS A 11 8.010 7.762 -2.324 1.00 0.50 C
112
+ ATOM 90 C LYS A 11 6.960 6.663 -2.454 1.00 0.51 C
113
+ ATOM 91 CB LYS A 11 9.197 7.258 -1.501 1.00 0.48 C
114
+ ATOM 92 O LYS A 11 6.940 5.930 -3.445 1.00 0.51 O
115
+ ATOM 93 CG LYS A 11 10.514 7.947 -1.828 1.00 0.46 C
116
+ ATOM 94 CD LYS A 11 11.665 7.365 -1.017 1.00 0.49 C
117
+ ATOM 95 CE LYS A 11 12.973 8.092 -1.300 1.00 0.43 C
118
+ ATOM 96 NZ LYS A 11 14.104 7.528 -0.504 1.00 0.40 N
119
+ ATOM 97 N ILE A 12 6.178 6.565 -1.403 1.00 0.53 N
120
+ ATOM 98 CA ILE A 12 5.147 5.534 -1.450 1.00 0.52 C
121
+ ATOM 99 C ILE A 12 4.183 5.820 -2.599 1.00 0.53 C
122
+ ATOM 100 CB ILE A 12 4.377 5.443 -0.114 1.00 0.51 C
123
+ ATOM 101 O ILE A 12 3.739 4.899 -3.289 1.00 0.54 O
124
+ ATOM 102 CG1 ILE A 12 5.283 4.883 0.988 1.00 0.45 C
125
+ ATOM 103 CG2 ILE A 12 3.117 4.587 -0.274 1.00 0.46 C
126
+ ATOM 104 CD1 ILE A 12 4.733 5.067 2.396 1.00 0.45 C
127
+ ATOM 105 N SER A 13 3.703 7.056 -2.575 1.00 0.53 N
128
+ ATOM 106 CA SER A 13 2.823 7.481 -3.659 1.00 0.52 C
129
+ ATOM 107 C SER A 13 3.421 7.144 -5.020 1.00 0.53 C
130
+ ATOM 108 CB SER A 13 2.549 8.982 -3.570 1.00 0.50 C
131
+ ATOM 109 O SER A 13 2.699 6.764 -5.945 1.00 0.53 O
132
+ ATOM 110 OG SER A 13 1.700 9.275 -2.474 1.00 0.49 O
133
+ ATOM 111 N LEU A 14 4.831 7.388 -5.129 1.00 0.48 N
134
+ ATOM 112 CA LEU A 14 5.477 7.149 -6.415 1.00 0.47 C
135
+ ATOM 113 C LEU A 14 5.426 5.669 -6.781 1.00 0.48 C
136
+ ATOM 114 CB LEU A 14 6.930 7.629 -6.382 1.00 0.44 C
137
+ ATOM 115 O LEU A 14 5.233 5.321 -7.948 1.00 0.47 O
138
+ ATOM 116 CG LEU A 14 7.205 9.002 -6.998 1.00 0.42 C
139
+ ATOM 117 CD1 LEU A 14 8.185 9.785 -6.130 1.00 0.36 C
140
+ ATOM 118 CD2 LEU A 14 7.740 8.854 -8.418 1.00 0.40 C
141
+ ATOM 119 N VAL A 15 5.762 4.833 -5.746 1.00 0.52 N
142
+ ATOM 120 CA VAL A 15 5.724 3.409 -6.064 1.00 0.51 C
143
+ ATOM 121 C VAL A 15 4.335 3.031 -6.573 1.00 0.52 C
144
+ ATOM 122 CB VAL A 15 6.097 2.543 -4.840 1.00 0.49 C
145
+ ATOM 123 O VAL A 15 4.204 2.238 -7.509 1.00 0.51 O
146
+ ATOM 124 CG1 VAL A 15 5.908 1.060 -5.150 1.00 0.44 C
147
+ ATOM 125 CG2 VAL A 15 7.536 2.823 -4.408 1.00 0.46 C
148
+ ATOM 126 N LEU A 16 3.392 3.725 -5.916 1.00 0.55 N
149
+ ATOM 127 CA LEU A 16 2.029 3.405 -6.323 1.00 0.54 C
150
+ ATOM 128 C LEU A 16 1.741 3.935 -7.724 1.00 0.55 C
151
+ ATOM 129 CB LEU A 16 1.021 3.986 -5.328 1.00 0.52 C
152
+ ATOM 130 O LEU A 16 0.957 3.342 -8.468 1.00 0.55 O
153
+ ATOM 131 CG LEU A 16 0.971 3.325 -3.950 1.00 0.50 C
154
+ ATOM 132 CD1 LEU A 16 0.096 4.140 -3.003 1.00 0.46 C
155
+ ATOM 133 CD2 LEU A 16 0.458 1.893 -4.061 1.00 0.47 C
156
+ ATOM 134 N SER A 17 2.343 5.204 -7.894 1.00 0.49 N
157
+ ATOM 135 CA SER A 17 2.073 5.808 -9.195 1.00 0.48 C
158
+ ATOM 136 C SER A 17 2.809 5.072 -10.309 1.00 0.49 C
159
+ ATOM 137 CB SER A 17 2.475 7.284 -9.195 1.00 0.45 C
160
+ ATOM 138 O SER A 17 2.409 5.139 -11.474 1.00 0.48 O
161
+ ATOM 139 OG SER A 17 3.880 7.422 -9.077 1.00 0.43 O
162
+ ATOM 140 N GLU A 18 4.061 4.645 -9.888 1.00 0.51 N
163
+ ATOM 141 CA GLU A 18 4.832 3.986 -10.938 1.00 0.50 C
164
+ ATOM 142 C GLU A 18 4.292 2.588 -11.224 1.00 0.51 C
165
+ ATOM 143 CB GLU A 18 6.311 3.910 -10.552 1.00 0.47 C
166
+ ATOM 144 O GLU A 18 4.769 1.907 -12.134 1.00 0.50 O
167
+ ATOM 145 CG GLU A 18 6.999 5.266 -10.482 1.00 0.45 C
168
+ ATOM 146 CD GLU A 18 8.505 5.185 -10.674 1.00 0.47 C
169
+ ATOM 147 OE1 GLU A 18 9.058 4.062 -10.671 1.00 0.48 O
170
+ ATOM 148 OE2 GLU A 18 9.138 6.254 -10.827 1.00 0.42 O
171
+ ATOM 149 N LEU A 19 3.449 2.137 -10.297 1.00 0.52 N
172
+ ATOM 150 CA LEU A 19 2.839 0.883 -10.722 1.00 0.52 C
173
+ ATOM 151 C LEU A 19 2.051 1.073 -12.014 1.00 0.52 C
174
+ ATOM 152 CB LEU A 19 1.921 0.334 -9.627 1.00 0.50 C
175
+ ATOM 153 O LEU A 19 1.400 2.103 -12.204 1.00 0.52 O
176
+ ATOM 154 CG LEU A 19 2.610 -0.333 -8.436 1.00 0.49 C
177
+ ATOM 155 CD1 LEU A 19 1.682 -0.348 -7.226 1.00 0.47 C
178
+ ATOM 156 CD2 LEU A 19 3.050 -1.748 -8.796 1.00 0.48 C
179
+ ATOM 157 N PRO A 20 2.509 0.425 -12.947 1.00 0.52 N
180
+ ATOM 158 CA PRO A 20 1.748 0.593 -14.187 1.00 0.50 C
181
+ ATOM 159 C PRO A 20 0.239 0.494 -13.972 1.00 0.52 C
182
+ ATOM 160 CB PRO A 20 2.250 -0.555 -15.067 1.00 0.49 C
183
+ ATOM 161 O PRO A 20 -0.239 -0.467 -13.363 1.00 0.51 O
184
+ ATOM 162 CG PRO A 20 3.024 -1.436 -14.141 1.00 0.46 C
185
+ ATOM 163 CD PRO A 20 3.173 -0.730 -12.824 1.00 0.46 C
186
+ ATOM 164 N LEU A 21 -0.387 1.673 -13.561 1.00 0.49 N
187
+ ATOM 165 CA LEU A 21 -1.840 1.543 -13.546 1.00 0.49 C
188
+ ATOM 166 C LEU A 21 -2.334 0.805 -14.786 1.00 0.49 C
189
+ ATOM 167 CB LEU A 21 -2.501 2.921 -13.461 1.00 0.47 C
190
+ ATOM 168 O LEU A 21 -3.422 0.225 -14.776 1.00 0.48 O
191
+ ATOM 169 CG LEU A 21 -2.547 3.567 -12.075 1.00 0.45 C
192
+ ATOM 170 CD1 LEU A 21 -2.511 5.087 -12.197 1.00 0.41 C
193
+ ATOM 171 CD2 LEU A 21 -3.791 3.116 -11.316 1.00 0.42 C
194
+ ATOM 172 N GLU A 22 -1.284 0.823 -15.746 1.00 0.49 N
195
+ ATOM 173 CA GLU A 22 -1.690 0.325 -17.056 1.00 0.49 C
196
+ ATOM 174 C GLU A 22 -1.183 -1.095 -17.289 1.00 0.49 C
197
+ ATOM 175 CB GLU A 22 -1.184 1.252 -18.164 1.00 0.45 C
198
+ ATOM 176 O GLU A 22 -1.105 -1.553 -18.431 1.00 0.47 O
199
+ ATOM 177 CG GLU A 22 -1.870 2.611 -18.191 1.00 0.43 C
200
+ ATOM 178 CD GLU A 22 -1.638 3.375 -19.484 1.00 0.46 C
201
+ ATOM 179 OE1 GLU A 22 -0.921 2.862 -20.372 1.00 0.44 O
202
+ ATOM 180 OE2 GLU A 22 -2.176 4.498 -19.610 1.00 0.40 O
203
+ ATOM 181 N SER A 23 -0.507 -1.685 -16.243 1.00 0.48 N
204
+ ATOM 182 CA SER A 23 -0.409 -3.087 -16.636 1.00 0.47 C
205
+ ATOM 183 C SER A 23 -1.772 -3.771 -16.589 1.00 0.48 C
206
+ ATOM 184 CB SER A 23 0.575 -3.830 -15.732 1.00 0.45 C
207
+ ATOM 185 O SER A 23 -2.460 -3.724 -15.568 1.00 0.46 O
208
+ ATOM 186 OG SER A 23 0.373 -3.480 -14.374 1.00 0.42 O
209
+ ATOM 187 N LYS A 24 -2.783 -3.245 -17.333 1.00 0.48 N
210
+ ATOM 188 CA LYS A 24 -3.816 -4.243 -17.597 1.00 0.47 C
211
+ ATOM 189 C LYS A 24 -3.258 -5.658 -17.476 1.00 0.48 C
212
+ ATOM 190 CB LYS A 24 -4.420 -4.035 -18.987 1.00 0.46 C
213
+ ATOM 191 O LYS A 24 -2.311 -6.020 -18.177 1.00 0.46 O
214
+ ATOM 192 CG LYS A 24 -5.249 -2.766 -19.119 1.00 0.44 C
215
+ ATOM 193 CD LYS A 24 -6.028 -2.741 -20.427 1.00 0.46 C
216
+ ATOM 194 CE LYS A 24 -6.820 -1.449 -20.582 1.00 0.40 C
217
+ ATOM 195 NZ LYS A 24 -7.573 -1.413 -21.871 1.00 0.37 N
218
+ ATOM 196 N PRO A 25 -3.101 -6.132 -16.224 1.00 0.48 N
219
+ ATOM 197 CA PRO A 25 -2.777 -7.561 -16.224 1.00 0.47 C
220
+ ATOM 198 C PRO A 25 -3.419 -8.311 -17.389 1.00 0.49 C
221
+ ATOM 199 CB PRO A 25 -3.336 -8.049 -14.886 1.00 0.46 C
222
+ ATOM 200 O PRO A 25 -4.457 -7.887 -17.905 1.00 0.48 O
223
+ ATOM 201 CG PRO A 25 -4.170 -6.915 -14.383 1.00 0.44 C
224
+ ATOM 202 CD PRO A 25 -3.916 -5.715 -15.250 1.00 0.45 C
225
+ ATOM 203 N GLU A 26 -2.595 -8.749 -18.537 1.00 0.52 N
226
+ ATOM 204 CA GLU A 26 -3.262 -9.728 -19.391 1.00 0.52 C
227
+ ATOM 205 C GLU A 26 -4.559 -10.221 -18.756 1.00 0.52 C
228
+ ATOM 206 CB GLU A 26 -2.334 -10.911 -19.678 1.00 0.49 C
229
+ ATOM 207 O GLU A 26 -4.681 -10.258 -17.530 1.00 0.50 O
230
+ ATOM 208 CG GLU A 26 -1.229 -10.598 -20.678 1.00 0.48 C
231
+ ATOM 209 CD GLU A 26 -0.782 -11.811 -21.477 1.00 0.50 C
232
+ ATOM 210 OE1 GLU A 26 -1.195 -12.945 -21.142 1.00 0.51 O
233
+ ATOM 211 OE2 GLU A 26 -0.012 -11.627 -22.446 1.00 0.47 O
234
+ ATOM 212 N PRO A 27 -5.776 -9.853 -19.347 1.00 0.50 N
235
+ ATOM 213 CA PRO A 27 -6.879 -10.502 -18.634 1.00 0.49 C
236
+ ATOM 214 C PRO A 27 -6.418 -11.681 -17.781 1.00 0.50 C
237
+ ATOM 215 CB PRO A 27 -7.800 -10.971 -19.763 1.00 0.47 C
238
+ ATOM 216 O PRO A 27 -5.769 -12.599 -18.290 1.00 0.49 O
239
+ ATOM 217 CG PRO A 27 -7.065 -10.637 -21.020 1.00 0.45 C
240
+ ATOM 218 CD PRO A 27 -5.782 -9.947 -20.654 1.00 0.45 C
241
+ ATOM 219 N VAL A 28 -5.550 -11.387 -16.664 1.00 0.49 N
242
+ ATOM 220 CA VAL A 28 -5.461 -12.521 -15.750 1.00 0.48 C
243
+ ATOM 221 C VAL A 28 -6.814 -13.223 -15.665 1.00 0.49 C
244
+ ATOM 222 CB VAL A 28 -4.999 -12.081 -14.343 1.00 0.47 C
245
+ ATOM 223 O VAL A 28 -7.852 -12.571 -15.529 1.00 0.48 O
246
+ ATOM 224 CG1 VAL A 28 -4.676 -13.297 -13.476 1.00 0.45 C
247
+ ATOM 225 CG2 VAL A 28 -3.787 -11.157 -14.445 1.00 0.46 C
248
+ ATOM 226 N GLN A 29 -7.108 -14.082 -16.749 1.00 0.53 N
249
+ ATOM 227 CA GLN A 29 -8.211 -15.035 -16.676 1.00 0.53 C
250
+ ATOM 228 C GLN A 29 -8.987 -14.878 -15.372 1.00 0.53 C
251
+ ATOM 229 CB GLN A 29 -7.692 -16.467 -16.810 1.00 0.48 C
252
+ ATOM 230 O GLN A 29 -8.402 -14.913 -14.287 1.00 0.51 O
253
+ ATOM 231 CG GLN A 29 -7.428 -16.894 -18.247 1.00 0.46 C
254
+ ATOM 232 CD GLN A 29 -7.062 -18.362 -18.366 1.00 0.48 C
255
+ ATOM 233 NE2 GLN A 29 -6.750 -18.799 -19.581 1.00 0.41 N
256
+ ATOM 234 OE1 GLN A 29 -7.058 -19.097 -17.373 1.00 0.50 O
257
+ ATOM 235 N GLY A 30 -9.845 -13.707 -15.208 1.00 0.58 N
258
+ ATOM 236 CA GLY A 30 -11.064 -13.567 -14.428 1.00 0.57 C
259
+ ATOM 237 C GLY A 30 -10.991 -12.454 -13.400 1.00 0.58 C
260
+ ATOM 238 O GLY A 30 -9.936 -12.218 -12.807 1.00 0.55 O
261
+ ATOM 239 N ALA A 31 -11.469 -11.174 -13.727 1.00 0.65 N
262
+ ATOM 240 CA ALA A 31 -11.865 -10.061 -12.868 1.00 0.64 C
263
+ ATOM 241 C ALA A 31 -11.715 -10.426 -11.394 1.00 0.65 C
264
+ ATOM 242 CB ALA A 31 -13.303 -9.645 -13.166 1.00 0.61 C
265
+ ATOM 243 O ALA A 31 -11.286 -9.600 -10.584 1.00 0.65 O
266
+ ATOM 244 N ALA A 32 -12.034 -11.668 -11.139 1.00 0.68 N
267
+ ATOM 245 CA ALA A 32 -11.937 -12.088 -9.744 1.00 0.67 C
268
+ ATOM 246 C ALA A 32 -10.490 -12.054 -9.260 1.00 0.68 C
269
+ ATOM 247 CB ALA A 32 -12.521 -13.487 -9.567 1.00 0.64 C
270
+ ATOM 248 O ALA A 32 -10.211 -11.605 -8.146 1.00 0.67 O
271
+ ATOM 249 N LEU A 33 -9.591 -12.548 -10.047 1.00 0.67 N
272
+ ATOM 250 CA LEU A 33 -8.192 -12.557 -9.631 1.00 0.66 C
273
+ ATOM 251 C LEU A 33 -7.660 -11.135 -9.484 1.00 0.67 C
274
+ ATOM 252 CB LEU A 33 -7.339 -13.332 -10.638 1.00 0.63 C
275
+ ATOM 253 O LEU A 33 -6.890 -10.849 -8.564 1.00 0.65 O
276
+ ATOM 254 CG LEU A 33 -5.950 -13.761 -10.161 1.00 0.59 C
277
+ ATOM 255 CD1 LEU A 33 -6.019 -15.128 -9.488 1.00 0.54 C
278
+ ATOM 256 CD2 LEU A 33 -4.967 -13.783 -11.326 1.00 0.55 C
279
+ ATOM 257 N GLN A 34 -8.043 -10.287 -10.357 1.00 0.68 N
280
+ ATOM 258 CA GLN A 34 -7.625 -8.892 -10.272 1.00 0.68 C
281
+ ATOM 259 C GLN A 34 -8.075 -8.262 -8.957 1.00 0.69 C
282
+ ATOM 260 CB GLN A 34 -8.176 -8.093 -11.454 1.00 0.65 C
283
+ ATOM 261 O GLN A 34 -7.304 -7.552 -8.307 1.00 0.68 O
284
+ ATOM 262 CG GLN A 34 -7.476 -6.759 -11.674 1.00 0.61 C
285
+ ATOM 263 CD GLN A 34 -7.961 -6.040 -12.920 1.00 0.59 C
286
+ ATOM 264 NE2 GLN A 34 -7.359 -4.893 -13.214 1.00 0.48 N
287
+ ATOM 265 OE1 GLN A 34 -8.868 -6.513 -13.612 1.00 0.55 O
288
+ ATOM 266 N ALA A 35 -9.311 -8.425 -8.724 1.00 0.72 N
289
+ ATOM 267 CA ALA A 35 -9.858 -7.851 -7.498 1.00 0.71 C
290
+ ATOM 268 C ALA A 35 -9.115 -8.371 -6.270 1.00 0.73 C
291
+ ATOM 269 CB ALA A 35 -11.348 -8.161 -7.384 1.00 0.70 C
292
+ ATOM 270 O ALA A 35 -8.849 -7.616 -5.332 1.00 0.73 O
293
+ ATOM 271 N GLU A 36 -8.800 -9.602 -6.322 1.00 0.75 N
294
+ ATOM 272 CA GLU A 36 -8.080 -10.212 -5.208 1.00 0.75 C
295
+ ATOM 273 C GLU A 36 -6.689 -9.603 -5.050 1.00 0.76 C
296
+ ATOM 274 CB GLU A 36 -7.971 -11.726 -5.403 1.00 0.73 C
297
+ ATOM 275 O GLU A 36 -6.256 -9.313 -3.933 1.00 0.75 O
298
+ ATOM 276 CG GLU A 36 -7.373 -12.458 -4.210 1.00 0.68 C
299
+ ATOM 277 CD GLU A 36 -7.447 -13.972 -4.336 1.00 0.65 C
300
+ ATOM 278 OE1 GLU A 36 -8.001 -14.471 -5.342 1.00 0.64 O
301
+ ATOM 279 OE2 GLU A 36 -6.948 -14.664 -3.421 1.00 0.59 O
302
+ ATOM 280 N LEU A 37 -5.986 -9.435 -6.129 1.00 0.74 N
303
+ ATOM 281 CA LEU A 37 -4.628 -8.906 -6.060 1.00 0.73 C
304
+ ATOM 282 C LEU A 37 -4.630 -7.469 -5.549 1.00 0.74 C
305
+ ATOM 283 CB LEU A 37 -3.957 -8.970 -7.435 1.00 0.71 C
306
+ ATOM 284 O LEU A 37 -3.797 -7.100 -4.718 1.00 0.73 O
307
+ ATOM 285 CG LEU A 37 -3.472 -10.348 -7.888 1.00 0.66 C
308
+ ATOM 286 CD1 LEU A 37 -3.167 -10.337 -9.382 1.00 0.61 C
309
+ ATOM 287 CD2 LEU A 37 -2.244 -10.772 -7.090 1.00 0.61 C
310
+ ATOM 288 N LEU A 38 -5.626 -6.678 -6.066 1.00 0.73 N
311
+ ATOM 289 CA LEU A 38 -5.729 -5.296 -5.610 1.00 0.73 C
312
+ ATOM 290 C LEU A 38 -6.038 -5.238 -4.118 1.00 0.75 C
313
+ ATOM 291 CB LEU A 38 -6.811 -4.551 -6.397 1.00 0.71 C
314
+ ATOM 292 O LEU A 38 -5.501 -4.391 -3.401 1.00 0.75 O
315
+ ATOM 293 CG LEU A 38 -6.433 -4.102 -7.809 1.00 0.66 C
316
+ ATOM 294 CD1 LEU A 38 -7.679 -3.690 -8.586 1.00 0.60 C
317
+ ATOM 295 CD2 LEU A 38 -5.428 -2.956 -7.755 1.00 0.60 C
318
+ ATOM 296 N SER A 39 -6.898 -6.106 -3.777 1.00 0.77 N
319
+ ATOM 297 CA SER A 39 -7.237 -6.146 -2.358 1.00 0.77 C
320
+ ATOM 298 C SER A 39 -6.013 -6.466 -1.507 1.00 0.78 C
321
+ ATOM 299 CB SER A 39 -8.333 -7.179 -2.098 1.00 0.75 C
322
+ ATOM 300 O SER A 39 -5.816 -5.871 -0.445 1.00 0.78 O
323
+ ATOM 301 OG SER A 39 -8.631 -7.259 -0.715 1.00 0.68 O
324
+ ATOM 302 N GLN A 40 -5.194 -7.402 -1.959 1.00 0.76 N
325
+ ATOM 303 CA GLN A 40 -4.005 -7.786 -1.205 1.00 0.76 C
326
+ ATOM 304 C GLN A 40 -3.014 -6.629 -1.113 1.00 0.76 C
327
+ ATOM 305 CB GLN A 40 -3.334 -9.003 -1.843 1.00 0.73 C
328
+ ATOM 306 O GLN A 40 -2.433 -6.384 -0.054 1.00 0.76 O
329
+ ATOM 307 CG GLN A 40 -4.050 -10.317 -1.563 1.00 0.66 C
330
+ ATOM 308 CD GLN A 40 -3.515 -11.466 -2.398 1.00 0.63 C
331
+ ATOM 309 NE2 GLN A 40 -4.359 -12.462 -2.645 1.00 0.54 N
332
+ ATOM 310 OE1 GLN A 40 -2.354 -11.457 -2.820 1.00 0.60 O
333
+ ATOM 311 N VAL A 41 -2.792 -5.944 -2.237 1.00 0.75 N
334
+ ATOM 312 CA VAL A 41 -1.862 -4.820 -2.237 1.00 0.74 C
335
+ ATOM 313 C VAL A 41 -2.341 -3.752 -1.256 1.00 0.75 C
336
+ ATOM 314 CB VAL A 41 -1.704 -4.213 -3.649 1.00 0.73 C
337
+ ATOM 315 O VAL A 41 -1.550 -3.215 -0.478 1.00 0.74 O
338
+ ATOM 316 CG1 VAL A 41 -0.947 -2.888 -3.586 1.00 0.65 C
339
+ ATOM 317 CG2 VAL A 41 -0.989 -5.196 -4.575 1.00 0.66 C
340
+ ATOM 318 N ARG A 42 -3.621 -3.448 -1.361 1.00 0.76 N
341
+ ATOM 319 CA ARG A 42 -4.187 -2.474 -0.434 1.00 0.76 C
342
+ ATOM 320 C ARG A 42 -3.930 -2.881 1.013 1.00 0.77 C
343
+ ATOM 321 CB ARG A 42 -5.690 -2.314 -0.673 1.00 0.74 C
344
+ ATOM 322 O ARG A 42 -3.588 -2.042 1.849 1.00 0.77 O
345
+ ATOM 323 CG ARG A 42 -6.034 -1.420 -1.855 1.00 0.69 C
346
+ ATOM 324 CD ARG A 42 -7.539 -1.280 -2.036 1.00 0.65 C
347
+ ATOM 325 NE ARG A 42 -7.867 -0.358 -3.120 1.00 0.59 N
348
+ ATOM 326 NH1 ARG A 42 -10.131 -0.814 -3.100 1.00 0.49 N
349
+ ATOM 327 NH2 ARG A 42 -9.276 0.705 -4.589 1.00 0.45 N
350
+ ATOM 328 CZ ARG A 42 -9.091 -0.158 -3.601 1.00 0.59 C
351
+ ATOM 329 N GLN A 43 -4.215 -4.159 1.316 1.00 0.78 N
352
+ ATOM 330 CA GLN A 43 -4.020 -4.652 2.675 1.00 0.78 C
353
+ ATOM 331 C GLN A 43 -2.557 -4.540 3.096 1.00 0.78 C
354
+ ATOM 332 CB GLN A 43 -4.490 -6.103 2.793 1.00 0.76 C
355
+ ATOM 333 O GLN A 43 -2.260 -4.175 4.235 1.00 0.77 O
356
+ ATOM 334 CG GLN A 43 -4.504 -6.630 4.222 1.00 0.70 C
357
+ ATOM 335 CD GLN A 43 -5.452 -5.861 5.123 1.00 0.66 C
358
+ ATOM 336 NE2 GLN A 43 -4.992 -5.534 6.326 1.00 0.60 N
359
+ ATOM 337 OE1 GLN A 43 -6.588 -5.562 4.740 1.00 0.66 O
360
+ ATOM 338 N ASP A 44 -1.615 -4.885 2.249 1.00 0.76 N
361
+ ATOM 339 CA ASP A 44 -0.192 -4.815 2.566 1.00 0.75 C
362
+ ATOM 340 C ASP A 44 0.231 -3.383 2.883 1.00 0.75 C
363
+ ATOM 341 CB ASP A 44 0.643 -5.366 1.409 1.00 0.73 C
364
+ ATOM 342 O ASP A 44 1.022 -3.152 3.800 1.00 0.74 O
365
+ ATOM 343 CG ASP A 44 0.532 -6.874 1.262 1.00 0.68 C
366
+ ATOM 344 OD1 ASP A 44 0.048 -7.544 2.199 1.00 0.65 O
367
+ ATOM 345 OD2 ASP A 44 0.935 -7.396 0.200 1.00 0.66 O
368
+ ATOM 346 N ILE A 45 -0.307 -2.445 2.112 1.00 0.76 N
369
+ ATOM 347 CA ILE A 45 -0.006 -1.042 2.371 1.00 0.75 C
370
+ ATOM 348 C ILE A 45 -0.514 -0.652 3.758 1.00 0.76 C
371
+ ATOM 349 CB ILE A 45 -0.625 -0.122 1.295 1.00 0.73 C
372
+ ATOM 350 O ILE A 45 0.201 -0.011 4.531 1.00 0.75 O
373
+ ATOM 351 CG1 ILE A 45 0.082 -0.322 -0.050 1.00 0.67 C
374
+ ATOM 352 CG2 ILE A 45 -0.561 1.343 1.735 1.00 0.67 C
375
+ ATOM 353 CD1 ILE A 45 -0.609 0.363 -1.221 1.00 0.64 C
376
+ ATOM 354 N ALA A 46 -1.744 -0.973 3.967 1.00 0.76 N
377
+ ATOM 355 CA ALA A 46 -2.318 -0.646 5.269 1.00 0.75 C
378
+ ATOM 356 C ALA A 46 -1.481 -1.235 6.401 1.00 0.76 C
379
+ ATOM 357 CB ALA A 46 -3.757 -1.149 5.358 1.00 0.74 C
380
+ ATOM 358 O ALA A 46 -1.239 -0.572 7.413 1.00 0.75 O
381
+ ATOM 359 N ASN A 47 -1.100 -2.513 6.275 1.00 0.77 N
382
+ ATOM 360 CA ASN A 47 -0.287 -3.170 7.293 1.00 0.76 C
383
+ ATOM 361 C ASN A 47 1.046 -2.456 7.493 1.00 0.75 C
384
+ ATOM 362 CB ASN A 47 -0.053 -4.638 6.927 1.00 0.74 C
385
+ ATOM 363 O ASN A 47 1.503 -2.291 8.626 1.00 0.74 O
386
+ ATOM 364 CG ASN A 47 -1.313 -5.474 7.032 1.00 0.70 C
387
+ ATOM 365 ND2 ASN A 47 -1.301 -6.646 6.410 1.00 0.68 N
388
+ ATOM 366 OD1 ASN A 47 -2.291 -5.068 7.666 1.00 0.70 O
389
+ ATOM 367 N SER A 48 1.642 -2.096 6.364 1.00 0.75 N
390
+ ATOM 368 CA SER A 48 2.925 -1.406 6.457 1.00 0.73 C
391
+ ATOM 369 C SER A 48 2.787 -0.078 7.194 1.00 0.73 C
392
+ ATOM 370 CB SER A 48 3.508 -1.167 5.064 1.00 0.71 C
393
+ ATOM 371 O SER A 48 3.641 0.278 8.009 1.00 0.70 O
394
+ ATOM 372 OG SER A 48 3.811 -2.397 4.427 1.00 0.65 O
395
+ ATOM 373 N LEU A 49 1.734 0.640 6.845 1.00 0.72 N
396
+ ATOM 374 CA LEU A 49 1.508 1.919 7.510 1.00 0.72 C
397
+ ATOM 375 C LEU A 49 1.271 1.721 9.003 1.00 0.72 C
398
+ ATOM 376 CB LEU A 49 0.315 2.645 6.884 1.00 0.70 C
399
+ ATOM 377 O LEU A 49 1.763 2.499 9.823 1.00 0.71 O
400
+ ATOM 378 CG LEU A 49 0.534 3.228 5.487 1.00 0.66 C
401
+ ATOM 379 CD1 LEU A 49 -0.788 3.717 4.904 1.00 0.61 C
402
+ ATOM 380 CD2 LEU A 49 1.554 4.360 5.533 1.00 0.62 C
403
+ ATOM 381 N ASN A 50 0.475 0.748 9.327 1.00 0.74 N
404
+ ATOM 382 CA ASN A 50 0.188 0.475 10.732 1.00 0.73 C
405
+ ATOM 383 C ASN A 50 1.452 0.099 11.499 1.00 0.73 C
406
+ ATOM 384 CB ASN A 50 -0.860 -0.633 10.860 1.00 0.70 C
407
+ ATOM 385 O ASN A 50 1.622 0.493 12.654 1.00 0.71 O
408
+ ATOM 386 CG ASN A 50 -2.278 -0.113 10.732 1.00 0.65 C
409
+ ATOM 387 ND2 ASN A 50 -3.214 -1.008 10.440 1.00 0.63 N
410
+ ATOM 388 OD1 ASN A 50 -2.530 1.084 10.892 1.00 0.63 O
411
+ ATOM 389 N ALA A 51 2.296 -0.751 10.857 1.00 0.70 N
412
+ ATOM 390 CA ALA A 51 3.518 -1.202 11.518 1.00 0.69 C
413
+ ATOM 391 C ALA A 51 4.408 -0.020 11.892 1.00 0.68 C
414
+ ATOM 392 CB ALA A 51 4.279 -2.175 10.622 1.00 0.67 C
415
+ ATOM 393 O ALA A 51 5.038 -0.020 12.952 1.00 0.67 O
416
+ ATOM 394 N VAL A 52 4.525 0.951 11.010 1.00 0.67 N
417
+ ATOM 395 CA VAL A 52 5.352 2.120 11.288 1.00 0.66 C
418
+ ATOM 396 C VAL A 52 4.723 2.942 12.411 1.00 0.65 C
419
+ ATOM 397 CB VAL A 52 5.540 2.995 10.029 1.00 0.63 C
420
+ ATOM 398 O VAL A 52 5.431 3.495 13.256 1.00 0.64 O
421
+ ATOM 399 CG1 VAL A 52 6.261 4.296 10.378 1.00 0.57 C
422
+ ATOM 400 CG2 VAL A 52 6.309 2.226 8.956 1.00 0.58 C
423
+ ATOM 401 N ALA A 53 3.415 3.071 12.338 1.00 0.62 N
424
+ ATOM 402 CA ALA A 53 2.746 3.869 13.362 1.00 0.60 C
425
+ ATOM 403 C ALA A 53 2.961 3.272 14.750 1.00 0.61 C
426
+ ATOM 404 CB ALA A 53 1.253 3.980 13.059 1.00 0.58 C
427
+ ATOM 405 O ALA A 53 2.919 3.988 15.753 1.00 0.61 O
428
+ ATOM 406 N THR A 54 3.105 1.936 14.815 1.00 0.61 N
429
+ ATOM 407 CA THR A 54 3.158 1.297 16.126 1.00 0.61 C
430
+ ATOM 408 C THR A 54 4.591 1.253 16.648 1.00 0.61 C
431
+ ATOM 409 CB THR A 54 2.583 -0.131 16.075 1.00 0.57 C
432
+ ATOM 410 O THR A 54 4.835 0.805 17.770 1.00 0.59 O
433
+ ATOM 411 CG2 THR A 54 1.101 -0.114 15.712 1.00 0.51 C
434
+ ATOM 412 OG1 THR A 54 3.295 -0.892 15.092 1.00 0.54 O
435
+ ATOM 413 N ARG A 55 5.532 1.631 15.809 1.00 0.65 N
436
+ ATOM 414 CA ARG A 55 6.903 1.567 16.303 1.00 0.65 C
437
+ ATOM 415 C ARG A 55 7.118 2.544 17.453 1.00 0.65 C
438
+ ATOM 416 CB ARG A 55 7.895 1.859 15.175 1.00 0.61 C
439
+ ATOM 417 O ARG A 55 6.621 3.672 17.418 1.00 0.63 O
440
+ ATOM 418 CG ARG A 55 8.075 0.707 14.199 1.00 0.59 C
441
+ ATOM 419 CD ARG A 55 9.132 1.017 13.148 1.00 0.59 C
442
+ ATOM 420 NE ARG A 55 9.264 -0.067 12.180 1.00 0.52 N
443
+ ATOM 421 NH1 ARG A 55 10.992 0.916 11.004 1.00 0.41 N
444
+ ATOM 422 NH2 ARG A 55 10.179 -1.128 10.359 1.00 0.36 N
445
+ ATOM 423 CZ ARG A 55 10.145 -0.090 11.183 1.00 0.55 C
446
+ ATOM 424 N PRO A 56 7.605 2.129 18.545 1.00 0.62 N
447
+ ATOM 425 CA PRO A 56 7.968 3.039 19.634 1.00 0.61 C
448
+ ATOM 426 C PRO A 56 8.765 4.249 19.151 1.00 0.62 C
449
+ ATOM 427 CB PRO A 56 8.815 2.161 20.559 1.00 0.56 C
450
+ ATOM 428 O PRO A 56 9.656 4.109 18.310 1.00 0.59 O
451
+ ATOM 429 CG PRO A 56 8.612 0.769 20.056 1.00 0.53 C
452
+ ATOM 430 CD PRO A 56 8.067 0.841 18.659 1.00 0.55 C
453
+ ATOM 431 N GLY A 57 8.356 5.535 19.374 1.00 0.58 N
454
+ ATOM 432 CA GLY A 57 8.945 6.814 19.010 1.00 0.58 C
455
+ ATOM 433 C GLY A 57 8.368 7.395 17.733 1.00 0.58 C
456
+ ATOM 434 O GLY A 57 8.738 8.497 17.323 1.00 0.57 O
457
+ ATOM 435 N TYR A 58 7.679 6.470 16.960 1.00 0.54 N
458
+ ATOM 436 CA TYR A 58 7.100 7.135 15.799 1.00 0.54 C
459
+ ATOM 437 C TYR A 58 6.095 8.199 16.224 1.00 0.53 C
460
+ ATOM 438 CB TYR A 58 6.422 6.116 14.878 1.00 0.50 C
461
+ ATOM 439 O TYR A 58 6.065 9.295 15.659 1.00 0.52 O
462
+ ATOM 440 CG TYR A 58 6.176 6.630 13.480 1.00 0.49 C
463
+ ATOM 441 CD1 TYR A 58 4.946 7.177 13.124 1.00 0.47 C
464
+ ATOM 442 CD2 TYR A 58 7.173 6.568 12.512 1.00 0.48 C
465
+ ATOM 443 CE1 TYR A 58 4.714 7.650 11.837 1.00 0.47 C
466
+ ATOM 444 CE2 TYR A 58 6.953 7.038 11.222 1.00 0.48 C
467
+ ATOM 445 OH TYR A 58 5.498 8.043 9.618 1.00 0.44 O
468
+ ATOM 446 CZ TYR A 58 5.722 7.577 10.894 1.00 0.46 C
469
+ ATOM 447 N LEU A 59 5.154 7.928 17.166 1.00 0.50 N
470
+ ATOM 448 CA LEU A 59 4.166 8.868 17.683 1.00 0.50 C
471
+ ATOM 449 C LEU A 59 4.547 9.344 19.081 1.00 0.50 C
472
+ ATOM 450 CB LEU A 59 2.777 8.224 17.711 1.00 0.47 C
473
+ ATOM 451 O LEU A 59 3.832 10.147 19.684 1.00 0.49 O
474
+ ATOM 452 CG LEU A 59 2.130 7.949 16.353 1.00 0.45 C
475
+ ATOM 453 CD1 LEU A 59 0.903 7.059 16.521 1.00 0.42 C
476
+ ATOM 454 CD2 LEU A 59 1.756 9.257 15.663 1.00 0.44 C
477
+ ATOM 455 N ALA A 60 5.718 9.319 19.497 1.00 0.46 N
478
+ ATOM 456 CA ALA A 60 6.022 9.769 20.853 1.00 0.46 C
479
+ ATOM 457 C ALA A 60 7.279 10.633 20.876 1.00 0.46 C
480
+ ATOM 458 CB ALA A 60 6.186 8.572 21.787 1.00 0.42 C
481
+ ATOM 459 O ALA A 60 8.383 10.140 20.637 1.00 0.44 O
482
+ ATOM 460 N GLY A 61 7.357 11.746 20.071 1.00 0.45 N
483
+ ATOM 461 CA GLY A 61 8.144 12.780 20.724 1.00 0.45 C
484
+ ATOM 462 C GLY A 61 7.607 14.179 20.487 1.00 0.46 C
485
+ ATOM 463 O GLY A 61 7.660 14.688 19.366 1.00 0.44 O
486
+ ATOM 464 N GLY A 62 6.464 14.557 21.219 1.00 0.33 N
487
+ ATOM 465 CA GLY A 62 6.288 15.850 21.860 1.00 0.36 C
488
+ ATOM 466 C GLY A 62 7.363 16.852 21.487 1.00 0.33 C
489
+ ATOM 467 O GLY A 62 8.470 16.469 21.102 1.00 0.32 O
esm/mcp_output/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastmcp>=0.1.0
2
+ pydantic>=2.0.0
3
+ requests
4
+ biopython
esm/mcp_output/start_mcp.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
MCP Service Startup Entry Point
"""
import sys
import os

# Directory containing this script; the MCP plugin package lives beside it.
_here = os.path.dirname(os.path.abspath(__file__))
_plugin_dir = os.path.join(_here, "mcp_plugin")
if _plugin_dir not in sys.path:
    sys.path.insert(0, _plugin_dir)

# Set path to point to source directory (sibling "source" tree one level up).
sys.path.insert(0, os.path.join(os.path.dirname(_here), "source"))

from mcp_service import create_app


def main():
    """Start FastMCP Service"""
    app = create_app()
    # Use environment variable to configure port, default 8000
    port = int(os.environ.get("MCP_PORT", "8000"))

    # Select transport mode based on environment variable
    if os.environ.get("MCP_TRANSPORT", "stdio") == "http":
        app.run(transport="http", host="0.0.0.0", port=port)
    else:
        # Default to STDIO mode
        app.run()


if __name__ == "__main__":
    main()
esm/mcp_output/tests_mcp/test_mcp_basic.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
MCP Service Basic Tests
"""
import sys
import os

# Make the sibling mcp_plugin package importable before the tests run.
_project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
_plugin_dir = os.path.join(_project_root, "mcp_plugin")
if _plugin_dir not in sys.path:
    sys.path.insert(0, _plugin_dir)

# The bundled "source" tree sits one directory above the project root.
sys.path.insert(
    0,
    os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "source"),
)


def test_import_mcp_service():
    """Test that the MCP service can be imported correctly"""
    try:
        from mcp_service import create_app
        service = create_app()
        assert service is not None
        print("MCP service imported successfully")
        return True
    except Exception as e:
        print(f"Failed to import MCP service: {e}")
        return False


def test_adapter_init():
    """Test that the adapter can be initialized correctly"""
    try:
        from adapter import Adapter
        instance = Adapter()
        assert instance is not None
        print("Adapter initialized successfully")
        return True
    except Exception as e:
        print(f"Failed to initialize adapter: {e}")
        return False


if __name__ == "__main__":
    print("Running MCP service basic tests...")
    # Run both tests unconditionally so every failure is reported.
    results = [test_import_mcp_service(), test_adapter_init()]

    if all(results):
        print("All basic tests passed")
        sys.exit(0)
    else:
        print("Some tests failed")
        sys.exit(1)
esm/mcp_output/tests_smoke/test_smoke.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Smoke test: verify that the ``esm`` package can be imported."""
import importlib
import os
import sys

# Add current directory to Python path
sys.path.insert(0, os.getcwd())

# Prefer a bundled ./source checkout when present.
source_dir = os.path.join(os.getcwd(), "source")
if os.path.exists(source_dir):
    sys.path.insert(0, source_dir)


try:
    importlib.import_module("esm")
    print("OK - Successfully imported esm")
except ImportError as e:
    print(f"Failed to import esm: {e}")

    # Fallback candidates to retry.  (Fix: the original assigned
    # `fallback_packages = []` and immediately overwrote it with ['esm'];
    # the dead assignment has been removed.)
    fallback_packages = ['esm']

    for pkg in fallback_packages:
        try:
            importlib.import_module(pkg)
            print(f"OK - Successfully imported {pkg}")
            break
        except ImportError:
            continue
    else:
        # Runs only when the loop finished without a successful import.
        print("All import attempts failed")
esm/source/.flake8 ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [flake8]
2
+ max-line-length = 99
3
+ ignore = E203,W503
4
+ exclude =
5
+ .git,
6
+ __pycache__,
7
+ build,
8
+ dist,
9
+ experimental,
10
+ third_party
esm/source/.git-blame-ignore-revs ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Migrate code style to Black
2
+ 8bc7e948cd9bf0b6d1f2113e221ef548ef663377
esm/source/.github/ISSUE_TEMPLATE/bug.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: "[Bug Report]"
3
+ about: "Create a bug report. For other questions: see Discussions tab."
4
+
5
+ ---
6
+
7
+ NOTE: if this is not a bug report, please use the [GitHub Discussions](https://github.com/facebookresearch/esm/discussions) for support questions (How do I do X?), feature requests, ideas, showcasing new applications, etc.
8
+
9
+
10
+ **Bug description**
11
+ Please enter a clear and concise description of what the bug is.
12
+
13
+ **Reproduction steps**
14
+ Enter steps to reproduce the behavior.
15
+
16
+ **Expected behavior**
17
+ Give a clear and concise description of what you expected to happen.
18
+
19
+ **Logs**
20
+ Please paste the command line output:
21
+
22
+ ```
23
+ Output goes here
24
+ ```
25
+
26
+ **Additional context**
27
+ Add any other context about the problem here. (like proxy settings, network setup, overall goals, etc.)
esm/source/.gitignore ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tensor dumps
2
+ *.pt
3
+ # Compiler Output #
4
+ ###################
5
+ *.py[cod]
6
+ *.so
7
+ *.o
8
+ *.exe
9
+ *.class
10
+
11
+ # Folders #
12
+ ###########
13
+ bin/
14
+ build/
15
+ dist/
16
+ local/
17
+ tmp/
18
+ __pycache__/
19
+ *.egg-info/
20
+ .idea/
21
+ .ipynb_checkpoints/
22
+ .vscode/
23
+ esm/dev
24
+
25
+ # Junk #
26
+ ########
27
+ .DS_Store*
28
+ .*.swp
29
+ *.swp
30
+ *.log
31
+ *~
esm/source/CODE_OF_CONDUCT.rst ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Code of Conduct
2
+ ===============
3
+
4
+ Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please `read the full text`__ so that you can understand what actions will and will not be tolerated.
5
+
6
+ __ https://code.facebook.com/codeofconduct
esm/source/CONTRIBUTING.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to esm
2
+ We want to make contributing to this project as easy and transparent as
3
+ possible.
4
+
5
+ ## Pull Requests
6
+ We actively welcome your pull requests.
7
+
8
+ 1. Fork the repo and create your branch from `master`.
9
+ 2. If you've added code that should be tested, add tests.
10
+ 3. If you've changed APIs, update the documentation.
11
+ 4. Ensure the test suite passes.
12
+ 5. Make sure your code lints.
13
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14
+
15
+ ## Contributor License Agreement ("CLA")
16
+ In order to accept your pull request, we need you to submit a CLA. You only need
17
+ to do this once to work on any of Facebook's open source projects.
18
+
19
+ Complete your CLA here: <https://code.facebook.com/cla>
20
+
21
+ ## Issues
22
+ We use GitHub issues to track public bugs. Please ensure your description is
23
+ clear and has sufficient instructions to be able to reproduce the issue.
24
+
25
+ Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
26
+ disclosure of security bugs. In those cases, please go through the process
27
+ outlined on that page and do not file a public issue.
28
+
29
+ ## License
30
+ By contributing to esm, you agree that your contributions will be licensed
31
+ under the LICENSE file in the root directory of this source tree.
esm/source/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Meta Platforms, Inc. and affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
esm/source/README.md ADDED
@@ -0,0 +1,795 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evolutionary Scale Modeling
2
+
3
+ [![atlas](https://user-images.githubusercontent.com/3605224/199301187-a9e38b3f-71a7-44be-94f4-db0d66143c53.png)](https://esmatlas.com)
4
+
5
+ ***Update April 2023:*** Code for the two simultaneous preprints on protein design is now released! Code for "Language models generalize beyond natural proteins" is under [examples/lm-design/](examples/lm-design/). Code for "A high-level programming language for generative protein design" is under [examples/protein-programming-language/](examples/protein-programming-language/).
6
+
7
+ This repository contains code and pre-trained weights for **Transformer protein language models** from the Meta Fundamental AI Research Protein Team (FAIR), including our state-of-the-art [**ESM-2** and **ESMFold**](#esmfold), as well as [**MSA Transformer**](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1), [**ESM-1v**](#zs_variant) for predicting variant effects and [**ESM-IF1**](#invf) for inverse folding.
8
+ Transformer protein language models were introduced in the [2019 preprint](https://doi.org/10.1101/622803) of the paper ["Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences"](https://doi.org/10.1073/pnas.2016239118).
9
+ ESM-2 outperforms all tested single-sequence protein language models across a range of structure prediction tasks.
10
+ ESMFold harnesses the ESM-2 language model to generate accurate structure predictions end to end directly from the sequence of a protein.
11
+
12
+ In November 2022, we released `v0` of the [ESM Metagenomic Atlas](https://esmatlas.com), an open atlas of 617 million predicted metagenomic protein structures.
13
+ The Atlas was updated in March 2023 in collaboration with EBI. The new `v2023_02` adds another 150 million predicted structures to the Atlas, as well as pre-computed ESM2 embeddings.
14
+ Bulk download, blog post and the resources provided on the Atlas website are documented [on this README](#atlas).
15
+
16
+ In December 2022, we released two simultaneous preprints on protein design.
17
+ * "Language models generalize beyond natural proteins" ([PAPER](https://doi.org/10.1101/2022.12.21.521521), [CODE](examples/lm-design/)) uses ESM2 to design de novo proteins. The code and data associated with the preprint can be found [here](examples/lm-design/).
18
+ * "A high-level programming language for generative protein design" ([PAPER](https://doi.org/10.1101/2022.12.21.521526), [CODE](examples/protein-programming-language/)) uses ESMFold to design proteins according to a high-level programming language.
19
+
20
+
21
+
22
+ <details><summary><b>Citation</b></summary>
23
+ For ESM2, ESMFold and ESM Atlas:
24
+ ```bibtex
25
+ @article{lin2023evolutionary,
26
+ title = {Evolutionary-scale prediction of atomic-level protein structure with a language model},
27
+ author = {Zeming Lin and Halil Akin and Roshan Rao and Brian Hie and Zhongkai Zhu and Wenting Lu and Nikita Smetanin and Robert Verkuil and Ori Kabeli and Yaniv Shmueli and Allan dos Santos Costa and Maryam Fazel-Zarandi and Tom Sercu and Salvatore Candido and Alexander Rives },
28
+ journal = {Science},
29
+ volume = {379},
30
+ number = {6637},
31
+ pages = {1123-1130},
32
+ year = {2023},
33
+ doi = {10.1126/science.ade2574},
34
+ URL = {https://www.science.org/doi/abs/10.1126/science.ade2574},
35
+ note={Earlier versions as preprint: bioRxiv 2022.07.20.500902},
36
+ }
37
+ ```
38
+
39
+ For transformer protein language models:
40
+ ```bibtex
41
+ @article{rives2021biological,
42
+ title={Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences},
43
+ author={Rives, Alexander and Meier, Joshua and Sercu, Tom and Goyal, Siddharth and Lin, Zeming and Liu, Jason and Guo, Demi and Ott, Myle and Zitnick, C Lawrence and Ma, Jerry and others},
44
+ journal={Proceedings of the National Academy of Sciences},
45
+ volume={118},
46
+ number={15},
47
+ pages={e2016239118},
48
+ year={2021},
49
+ publisher={National Acad Sciences},
50
+ note={bioRxiv 10.1101/622803},
51
+ doi={10.1073/pnas.2016239118},
52
+ url={https://www.pnas.org/doi/full/10.1073/pnas.2016239118},
53
+ }
54
+ ```
55
+ </details>
56
+
57
+ <details open><summary><b>Table of contents</b></summary>
58
+
59
+ - [Main models you should use](#main-models)
60
+ - [Usage](#usage)
61
+ - [Quick Start](#quickstart)
62
+ - [Getting Started with this repository](#repostart)
63
+ - [ESMFold Structure Prediction](#esmfold)
64
+ - [Compute embeddings in bulk from FASTA](#bulk_fasta)
65
+ - [CPU offloading for inference with large models](#fsdp)
66
+ - [Zero-shot variant prediction](#zs_variant)
67
+ - [Inverse folding](#invf)
68
+ - [ESM Metagenomic Atlas](#atlas)
69
+ - [Notebooks](#notebooks)
70
+ - [Available Models and Datasets](#available)
71
+ - [Pre-trained Models](#available-models)
72
+ - [ESM Structural Split Dataset](#available-esmssd)
73
+ - [Pre-training Dataset Split](#available-pretraining-split)
74
+ - [Comparison to related works](#perf_related)
75
+ - [Citations](#citations)
76
+ - [License](#license)
77
+ </details>
78
+
79
+ <details><summary><b>What's New</b></summary>
80
+
81
+ - April 2023: Code for the protein design preprints released under [examples/lm-design/](examples/lm-design/).
82
+ - March 2023: We release an update to the ESM Metagenomic Atlas, `v2023_02`. See [website](https://esmatlas.com/) and [bulk download details](#atlas).
83
+ - December 2022: The Meta Fundamental AI Research Protein Team (FAIR) released two simultaneous preprints on protein design:
84
+ ["Language models generalize beyond natural proteins" (Verkuil, Kabeli, et al., 2022)](https://doi.org/10.1101/2022.12.21.521521), and ["A high-level programming language for generative protein design" (Hie, Candido, et al., 2022)](https://doi.org/10.1101/2022.12.21.521526).
85
+ - November 2022: ESM Metagenomic Atlas, a repository of 600M+ metagenomics structures released, see [website](https://esmatlas.com/) and [bulk download details](#atlas)
86
+ - November 2022: ESMFold - new end-to-end structure prediction model released (see [Lin et al. 2022](https://www.science.org/doi/abs/10.1126/science.ade2574))
87
+ - August 2022: ESM-2 - new SOTA Language Models released (see [Lin et al. 2022](https://www.science.org/doi/abs/10.1126/science.ade2574))
88
+ - April 2022: New inverse folding model ESM-IF1 released, trained on CATH and UniRef50 predicted structures.
89
+ - August 2021: Added flexibility to tokenizer to allow for spaces and special tokens (like `<mask>`) in sequence.
90
+ - July 2021: New pre-trained model ESM-1v released, trained on UniRef90 (see [Meier et al. 2021](https://doi.org/10.1101/2021.07.09.450648)).
91
+ - July 2021: New MSA Transformer released, with a minor fix in the row positional embeddings (`ESM-MSA-1b`).
92
+ - Feb 2021: MSA Transformer added (see [Rao et al. 2021](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1)). Example usage in [notebook](#notebooks).
93
+ - Dec 2020: [Self-Attention Contacts](#notebooks) for all pre-trained models (see [Rao et al. 2020](https://doi.org/10.1101/2020.12.15.422761))
94
+ - Dec 2020: Added new pre-trained model [ESM-1b](#perf_related) (see [Rives et al. 2019](https://doi.org/10.1101/622803) Appendix B)
95
+ - Dec 2020: [ESM Structural Split Dataset](#available-esmssd) (see [Rives et al. 2019](https://doi.org/10.1101/622803) Appendix A.10)
96
+
97
+ </details>
98
+
99
+ ## Main models you should use <a name="main-models"></a>
100
+
101
+ | Shorthand | `esm.pretrained.` | Dataset | Description |
102
+ |-----------|-----------------------------|---------|--------------|
103
+ | ESM-2 | `esm2_t36_3B_UR50D()` `esm2_t48_15B_UR50D()` | UR50 (sample UR90) | SOTA general-purpose protein language model. Can be used to predict structure, function and other protein properties directly from individual sequences. Released with [Lin et al. 2022](https://www.science.org/doi/abs/10.1126/science.ade2574) (Aug 2022 update). |
104
+ | ESMFold | `esmfold_v1()` | PDB + UR50 | End-to-end single sequence 3D structure predictor (Nov 2022 update). |
105
+ | ESM-MSA-1b| `esm_msa1b_t12_100M_UR50S()` | UR50 + MSA | MSA Transformer language model. Can be used to extract embeddings from an MSA. Enables SOTA inference of structure. Released with [Rao et al. 2021](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v2) (ICML'21 version, June 2021). |
106
+ | ESM-1v | `esm1v_t33_650M_UR90S_1()` ... `esm1v_t33_650M_UR90S_5()`| UR90 | Language model specialized for prediction of variant effects. Enables SOTA zero-shot prediction of the functional effects of sequence variations. Same architecture as ESM-1b, but trained on UniRef90. Released with [Meier et al. 2021](https://doi.org/10.1101/2021.07.09.450648). |
107
+ | ESM-IF1 | `esm_if1_gvp4_t16_142M_UR50()` | CATH + UR50 | Inverse folding model. Can be used to design sequences for given structures, or to predict functional effects of sequence variation for given structures. Enables SOTA fixed backbone sequence design. Released with [Hsu et al. 2022](https://doi.org/10.1101/2022.04.10.487779). |
108
+
109
+ For a complete list of available models, with details and release notes, see [Pre-trained Models](#available-models).
110
+
111
+
112
+ ## Usage <a name="usage"></a>
113
+
114
+ ### Quick start <a name="quickstart"></a>
115
+
116
+ An easy way to get started is to load ESM or ESMFold through the [HuggingFace transformers library](https://huggingface.co/docs/transformers/model_doc/esm),
117
+ which has simplified the ESMFold dependencies and provides a standardized API and tools to work with state-of-the-art pretrained models.
118
+
119
+ Alternatively, [ColabFold](https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/ESMFold.ipynb) has integrated ESMFold so that you can
120
+ easily run it directly in the browser on a Google Colab instance.
121
+
122
+ We also provide an API which you can access through curl or on [the ESM Metagenomic Atlas web page](https://esmatlas.com/resources?action=fold).
123
+ ```
124
+ curl -X POST --data "KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL" https://api.esmatlas.com/foldSequence/v1/pdb/
125
+ ```
126
+
127
+ For ESM-MSA-1b, ESM-IF1, or any of the other models you can use the original implementation from our repo directly via the instructions below.
128
+
129
+ ### Getting started with this repo <a name="repostart"></a>
130
+
131
+ As a prerequisite, you must have PyTorch installed to use this repository.
132
+
133
+ You can use this one-liner for installation, using the latest release of esm:
134
+
135
+ ```bash
136
+ pip install fair-esm # latest release, OR:
137
+ pip install git+https://github.com/facebookresearch/esm.git # bleeding edge, current repo main branch
138
+ ```
139
+
140
+ To use the ESMFold model, make sure you start from an environment with python <= 3.9 and pytorch installed.
141
+ Then add the `[esmfold]` option to your pip install, which will install the dependencies for OpenFold
142
+ automatically. Openfold installation requires `nvcc`.
143
+
144
+ ```bash
145
+ pip install "fair-esm[esmfold]"
146
+ # OpenFold and its remaining dependency
147
+ pip install 'dllogger @ git+https://github.com/NVIDIA/dllogger.git'
148
+ pip install 'openfold @ git+https://github.com/aqlaboratory/openfold.git@4b41059694619831a7db195b7e0988fc4ff3a307'
149
+ ```
150
+
151
+ **NOTE**: If openfold installation fails, please double check that `nvcc` is available and that a cuda-compatible version of PyTorch has been installed.
152
+
153
+ Alternatively, we provide the `esmfold` conda environment, which can be built via `conda env create -f environment.yml`.
154
+
155
+ We also support PyTorch Hub, which removes the need to clone and/or install this repository yourself:
156
+
157
+ ```python
158
+ import torch
159
+ model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")
160
+ ```
161
+
162
+ After pip install, you can load and use a pretrained model as follows:
163
+
164
+ ```python
165
+ import torch
166
+ import esm
167
+
168
+ # Load ESM-2 model
169
+ model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
170
+ batch_converter = alphabet.get_batch_converter()
171
+ model.eval() # disables dropout for deterministic results
172
+
173
+ # Prepare data (first 2 sequences from ESMStructuralSplitDataset superfamily / 4)
174
+ data = [
175
+ ("protein1", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"),
176
+ ("protein2", "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE"),
177
+ ("protein2 with mask","KALTARQQEVFDLIRD<mask>ISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE"),
178
+ ("protein3", "K A <mask> I S Q"),
179
+ ]
180
+ batch_labels, batch_strs, batch_tokens = batch_converter(data)
181
+ batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)
182
+
183
+ # Extract per-residue representations (on CPU)
184
+ with torch.no_grad():
185
+ results = model(batch_tokens, repr_layers=[33], return_contacts=True)
186
+ token_representations = results["representations"][33]
187
+
188
+ # Generate per-sequence representations via averaging
189
+ # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
190
+ sequence_representations = []
191
+ for i, tokens_len in enumerate(batch_lens):
192
+ sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))
193
+
194
+ # Look at the unsupervised self-attention map contact predictions
195
+ import matplotlib.pyplot as plt
196
+ for (_, seq), tokens_len, attention_contacts in zip(data, batch_lens, results["contacts"]):
197
+ plt.matshow(attention_contacts[: tokens_len, : tokens_len])
198
+ plt.title(seq)
199
+ plt.show()
200
+ ```
201
+
202
+
203
+ ### ESMFold Structure Prediction <a name="esmfold"></a>
204
+
205
+ After installing with the `[esmfold]` option, you can use the ESMFold structure prediction model as follows:
206
+
207
+ ```python
208
+ import torch
209
+ import esm
210
+
211
+ model = esm.pretrained.esmfold_v1()
212
+ model = model.eval().cuda()
213
+
214
+ # Optionally, uncomment to set a chunk size for axial attention. This can help reduce memory.
215
+ # Lower sizes will have lower memory requirements at the cost of increased speed.
216
+ # model.set_chunk_size(128)
217
+
218
+ sequence = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"
219
+ # Multimer prediction can be done with chains separated by ':'
220
+
221
+ with torch.no_grad():
222
+ output = model.infer_pdb(sequence)
223
+
224
+ with open("result.pdb", "w") as f:
225
+ f.write(output)
226
+
227
+ import biotite.structure.io as bsio
228
+ struct = bsio.load_structure("result.pdb", extra_fields=["b_factor"])
229
+ print(struct.b_factor.mean()) # this will be the pLDDT
230
+ # 88.3
231
+ ```
232
+
233
+
234
+ Besides `esm.pretrained.esmfold_v1()` which is the best performing model we recommend using, we
235
+ also provide `esm.pretrained.esmfold_v0()` which was used for the experiments in
236
+ [Lin et al. 2022](https://www.science.org/doi/abs/10.1126/science.ade2574).
237
+
238
+ We also provide a command line interface (`esm-fold`) that efficiently predicts structures in bulk from a FASTA file using ESMFold:
239
+ ```
240
+ usage: esm-fold [-h] -i FASTA -o PDB [--num-recycles NUM_RECYCLES]
241
+ [--max-tokens-per-batch MAX_TOKENS_PER_BATCH]
242
+ [--chunk-size CHUNK_SIZE] [--cpu-only] [--cpu-offload]
243
+
244
+ optional arguments:
245
+ -h, --help show this help message and exit
246
+ -i FASTA, --fasta FASTA
247
+ Path to input FASTA file
248
+ -o PDB, --pdb PDB Path to output PDB directory
249
+ --num-recycles NUM_RECYCLES
250
+ Number of recycles to run. Defaults to number used in
251
+ training (4).
252
+ --max-tokens-per-batch MAX_TOKENS_PER_BATCH
253
+ Maximum number of tokens per gpu forward-pass. This
254
+ will group shorter sequences together for batched
255
+ prediction. Lowering this can help with out of memory
256
+ issues, if these occur on short sequences.
257
+ --chunk-size CHUNK_SIZE
258
+ Chunks axial attention computation to reduce memory
259
+ usage from O(L^2) to O(L). Equivalent to running a for
260
+ loop over chunks of each dimension. Lower values
261
+ will result in lower memory usage at the cost of
262
+ speed. Recommended values: 128, 64, 32. Default: None.
263
+ --cpu-only CPU only
264
+ --cpu-offload Enable CPU offloading
265
+ ```
266
+
267
+ The command will make one prediction for every sequence in the fasta file. Multimers can be predicted and should be entered in the fasta file as a single sequence, with chains separated by a ":" character.
268
+
269
+ By default, predictions will be batched together so that shorter sequences are predicted simultaneously. This can be disabled by setting `--max-tokens-per-batch=0`. Batching can significantly improve prediction speed on shorter sequences.
270
+
271
+ The `--cpu-offload` flag can be useful for making predictions on longer sequences. It will attempt to offload some parameters to the CPU RAM, rather than storing on GPU.
272
+
273
+ Finally, the ablation experiments for LMs of varying sizes [Lin et al. 2022 table S1](https://www.science.org/doi/abs/10.1126/science.ade2574) are released as `esm.pretrained.esmfold_structure_module_only_*()`. We don't recommend using these models for structure prediction.
274
+
275
+
276
+ ### Compute embeddings in bulk from FASTA <a name="bulk_fasta"></a>
277
+
278
+ We provide a command line interface (`esm-extract`) that efficiently extracts embeddings in bulk for a FASTA file from the ESM:
279
+ ```
280
+ usage: esm-extract [-h] [--toks_per_batch TOKS_PER_BATCH]
281
+ [--repr_layers REPR_LAYERS [REPR_LAYERS ...]] --include
282
+ {mean,per_tok,bos,contacts}
283
+ [{mean,per_tok,bos,contacts} ...]
284
+ [--truncation_seq_length TRUNCATION_SEQ_LENGTH]
285
+ model_location fasta_file output_dir
286
+
287
+ Extract per-token representations and model outputs for sequences in a FASTA
288
+ file
289
+
290
+ positional arguments:
291
+ model_location PyTorch model file OR name of pretrained model to
292
+ download (see README for models)
293
+ fasta_file FASTA file on which to extract representations
294
+ output_dir output directory for extracted representations
295
+
296
+ optional arguments:
297
+ -h, --help show this help message and exit
298
+ --toks_per_batch TOKS_PER_BATCH
299
+ maximum batch size
300
+ --repr_layers REPR_LAYERS [REPR_LAYERS ...]
301
+ layers indices from which to extract representations
302
+ (0 to num_layers, inclusive)
303
+ --include {mean,per_tok,bos,contacts} [{mean,per_tok,bos,contacts} ...]
304
+ specify which representations to return
305
+ --truncation_seq_length TRUNCATION_SEQ_LENGTH
306
+ truncate sequences longer than the given value
307
+ ```
308
+
309
+ The following commands allow the extraction of the final-layer embedding for a FASTA file from the ESM-2 model:
310
+
311
+ ```bash
312
+ esm-extract esm2_t33_650M_UR50D examples/data/some_proteins.fasta \
313
+ examples/data/some_proteins_emb_esm2 --repr_layers 0 32 33 --include mean per_tok
314
+ ```
315
+ ```bash
316
+ python scripts/extract.py esm2_t33_650M_UR50D examples/data/some_proteins.fasta \
317
+ examples/data/some_proteins_emb_esm2 --repr_layers 0 32 33 --include mean per_tok
318
+ ```
319
+
320
+ A cuda device is optional and will be auto-detected.
321
+
322
+ Directory `some_proteins_emb_esm2/` now contains one `.pt` file per FASTA sequence; use `torch.load()` to load them.
323
+ `scripts/extract.py` has flags that determine what's included in the `.pt` file:
324
+ * `--repr_layers` (default: final only) selects which layers to include embeddings from.
325
+ * `--include` specifies what embeddings to save. You can use the following:
326
+ * `per_tok` includes the full sequence, with an embedding per amino acid (seq_len x hidden_dim).
327
+ * `mean` includes the embeddings averaged over the full sequence, per layer.
328
+ * `bos` includes the embeddings from the beginning-of-sequence token.
329
+ (NOTE: Don't use with the pre-trained models - we trained without bos-token supervision)
330
+
331
+
332
+ ### CPU offloading for inference with large models <a name="fsdp"></a>
333
+ If you want to load very large models like 15B and/or do inference on long sequences on your machine, regular GPU inference may lead to OOM errors.
334
+ We show how to load the model with Fairscale's [Fully Sharded Data Parallel (FSDP)](https://fairscale.readthedocs.io/en/stable/api/nn/fsdp.html) and
335
+ use its CPU offloading feature.
336
+ This allows to do inference of large models on a single GPU.
337
+ Please check out `examples/esm2_infer_fairscale_fsdp_cpu_offloading.py` for more details.
338
+
339
+ ### Zero-shot variant prediction <a name="zs_variant"></a>
340
+ See "[examples/variant-prediction/](examples/variant-prediction/)" for code and pre-trained weights for the ESM-1v models described in
341
+ [Language models enable zero-shot prediction of the effects of mutations on protein function. (Meier et al. 2021)](https://doi.org/10.1101/2021.07.09.450648).
342
+
343
+ Note that ESM-2 could be used for variant prediction as well, and is expected to have similar performance to ESM-1v.
344
+
345
+ ### Inverse folding <a name="invf"></a>
346
+ See "[examples/inverse_folding/](examples/inverse_folding/)" for detailed user guide. The ESM-IF1 model is described as `GVPTransformer` in [Learning inverse folding from millions of predicted structures. (Hsu et al. 2022)](https://doi.org/10.1101/2022.04.10.487779).
347
+
348
+ We also provide a colab notebook for the sequence design and sequence scoring functionalities.
349
+
350
+ [<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/inverse_folding/notebook_multichain.ipynb)
351
+
352
+ The ESM-IF1 inverse folding model is built for predicting protein sequences
353
+ from their backbone atom coordinates. We provide scripts here 1) to sample sequence
354
+ designs for a given structure and 2) to score sequences for a given structure.
355
+
356
+ Trained with 12M protein structures predicted by AlphaFold2, the ESM-IF1
357
+ model consists of invariant geometric input processing layers followed by a
358
+ sequence-to-sequence transformer, and achieves 51% native sequence recovery on
359
+ structurally held-out backbones with 72% recovery for buried residues.
360
+ The model is also trained with span masking to tolerate missing backbone
361
+ coordinates and therefore can predict sequences for partially masked structures.
362
+
363
+ #### Sample sequence designs for a given structure
364
+ The environment setup is described in [this subsection of examples/inverse_folding](examples/inverse_folding#recommended-environment).
365
+
366
+ To sample sequences for a given structure in PDB or mmCIF format, use the
367
+ `sample_sequences.py` script. The input file can have either `.pdb` or
368
+ `.cif` as suffix.
369
+
370
+ For example, to sample 3 sequence designs for the golgi casein kinase structure
371
+ (PDB [5YH2](https://www.rcsb.org/structure/5yh2); [PDB Molecule of the Month
372
+ from January 2022](https://pdb101.rcsb.org/motm/265)), we can run the following
373
+ command from the esm root directory:
374
+ ```bash
375
+ python examples/inverse_folding/sample_sequences.py examples/inverse_folding/data/5YH2.pdb \
376
+ --chain C --temperature 1 --num-samples 3 --outpath examples/inverse_folding/output/sampled_sequences.fasta
377
+ ```
378
+
379
+ The sampled sequences will be saved in a fasta format to the specified output file.
380
+
381
+ The temperature parameter controls the sharpness of the probability
382
+ distribution for sequence sampling. Higher sampling temperatures yield more
383
+ diverse sequences but likely with lower native sequence recovery.
384
+ The default sampling temperature is 1. To optimize for native sequence
385
+ recovery, we recommend sampling with low temperature such as 1e-6.
386
+
387
+ #### Scoring sequences
388
+ To score the conditional log-likelihoods for sequences conditioned on a given
389
+ structure, use the `score_log_likelihoods.py` script.
390
+
391
+ For example, to score the sequences in `examples/inverse_folding/data/5YH2_mutated_seqs.fasta`
392
+ according to the structure in `examples/inverse_folding/data/5YH2.pdb`, we can run
393
+ the following command from the esm root directory:
394
+ ```
395
+ python examples/inverse_folding/score_log_likelihoods.py examples/inverse_folding/data/5YH2.pdb \
396
+ examples/inverse_folding/data/5YH2_mutated_seqs.fasta --chain C \
397
+ --outpath examples/inverse_folding/output/5YH2_mutated_seqs_scores.csv
398
+ ```
399
+
400
+ The conditional log-likelihoods are saved in a csv format in the specified output path.
401
+ The output values are the average log-likelihoods averaged over all amino acids in a sequence.
402
+
403
+ For more information, see "[./examples/inverse_folding/](examples/inverse_folding/)" for detailed user guide.
404
+
405
+ ## ESM Metagenomic Atlas <a name="atlas"></a>
406
+
407
+ Please visit the [ESM Metagenomic Atlas](https://esmatlas.com/) website, and
408
+ see our [blog post](https://ai.facebook.com/blog/protein-folding-esmfold-metagenomics/) to learn more.
409
+
410
+ Bulk download instructions available at a separate README [here](scripts/atlas/README.md).
411
+
412
+ The Atlas resources include a page to [fold a sequence using ESMFold](https://esmatlas.com/resources?action=fold),
413
+ searching a subset of the ESM Atlas by [structure](https://esmatlas.com/resources?action=search_structure) or
414
+ [sequence](https://esmatlas.com/resources?action=search_sequence),
415
+ as well as an [API](https://esmatlas.com/about#api) to access those resources programmatically.
416
+
417
+ Foldseek provides search against the Atlas without the length limitation [here](https://search.foldseek.com/search).
418
+
419
+
420
+ ## Notebooks <a name="notebooks"></a>
421
+
422
+ ### Inverse folding - predicting or scoring sequences based on backbone structures
423
+
424
+ [<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/inverse_folding/notebook.ipynb)
425
+
426
+ The ESM-IF1 inverse folding model predicts protein sequences from their backbone atom coordinates, trained with 12M protein structures predicted by AlphaFold2.
427
+ This notebook guides you through examples of sampling sequences, calculating conditional log-likelihoods, and extracting encoder output as structure representation.
428
+
429
+ ### Supervised variant prediction - training a classifier on the embeddings
430
+
431
+ [<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/sup_variant_prediction.ipynb)
432
+
433
+
434
+ To help you get started with using the embeddings, this [jupyter notebook tutorial](examples/sup_variant_prediction.ipynb) shows how to train a supervised variant predictor using embeddings from ESM-1.
435
+ You can adopt a similar protocol to train a model for any downstream task, even with limited data.
436
+ First you can obtain the embeddings for ``examples/data/P62593.fasta`` either by [downloading the precomputed](https://dl.fbaipublicfiles.com/fair-esm/examples/P62593_reprs.tar.gz) embeddings
437
+ as instructed in the notebook or by running the following:
438
+
439
+ ```bash
440
+ # Obtain the embeddings
441
+ python scripts/extract.py esm1v_t33_650M_UR90S_1 examples/data/P62593.fasta \
442
+ examples/data/P62593_emb_esm1v --repr_layers 33 --include mean
443
+ ```
444
+
445
+ Then, follow the remaining instructions in the tutorial. You can also run the tutorial in a [colab notebook](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/sup_variant_prediction.ipynb).
446
+
447
+ **Note, alternatively use [the newer instructions for zero-shot variant prediction](examples/variant-prediction/),
448
+ which predicts mutational effects without any supervised training.**
449
+
450
+
451
+ ### Unsupervised contact prediction
452
+ [<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/contact_prediction.ipynb)
453
+
454
+ This [jupyter notebook tutorial](examples/contact_prediction.ipynb) demonstrates contact prediction with both the ESM-2 and MSA Transformer (ESM-MSA-1) models.
455
+ Contact prediction is based on a logistic regression over the model's attention maps.
456
+ This methodology is based on our ICLR 2021 paper,
457
+ [Transformer protein language models are unsupervised structure learners. (Rao et al. 2020)](https://doi.org/10.1101/2020.12.15.422761)
458
+ The MSA Transformer (ESM-MSA-1) takes a multiple sequence alignment (MSA) as input, and uses the tied row self-attention maps in the same way.
459
+ See [MSA Transformer. (Rao et al. 2021)](https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1).
460
+
461
+ To get unsupervised attention-based contacts, call `model.predict_contacts(tokens)` or `model(tokens, return_contacts=True)`.
462
+
463
+
464
+ ### ESMStructuralSplitDataset and self-attention contact prediction
465
+ [<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/facebookresearch/esm/blob/main/examples/esm_structural_dataset.ipynb)
466
+
467
+ And this [jupyter notebook tutorial](examples/esm_structural_dataset.ipynb) shows how to load and index the `ESMStructuralSplitDataset`,
468
+ and computes the self-attention map unsupervised contact predictions using ESM-2.
469
+
470
+
471
+ ## Available Models and Datasets <a name="available"></a>
472
+
473
+ ### Pre-trained Models <a name="available-models"></a>
474
+
475
+ | Shorthand | `esm.pretrained.` | #layers | #params | Dataset | Embedding Dim | Model URL (automatically downloaded to `~/.cache/torch/hub/checkpoints`) |
476
+ |-----------|---------------------|---------|-------------|---------|---------------|-----------------------------------------------------------------------|
477
+ | ESM-2 | `esm2_t48_15B_UR50D` | 48 | 15B | UR50/D 2021_04 | 5120 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t48_15B_UR50D.pt |
478
+ | | `esm2_t36_3B_UR50D` | 36 | 3B | UR50/D 2021_04 | 2560 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t36_3B_UR50D.pt |
479
+ | | `esm2_t33_650M_UR50D` | 33 | 650M | UR50/D 2021_04 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt |
480
+ | | `esm2_t30_150M_UR50D` | 30 | 150M | UR50/D 2021_04 | 640 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t30_150M_UR50D.pt |
481
+ | | `esm2_t12_35M_UR50D` | 12 | 35M | UR50/D 2021_04 | 480 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t12_35M_UR50D.pt |
482
+ | | `esm2_t6_8M_UR50D` | 6 | 8M | UR50/D 2021_04 | 320 | https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t6_8M_UR50D.pt |
483
+ | ESMFold | `esmfold_v1` | 48 (+36) | 690M (+3B) | UR50/D 2021_04 | - | https://dl.fbaipublicfiles.com/fair-esm/models/esmfold_3B_v1.pt |
484
+ | | `esmfold_v0` | 48 (+36) | 690M (+3B) | UR50/D 2021_04 | - | https://dl.fbaipublicfiles.com/fair-esm/models/esmfold_3B_v0.pt |
485
+ | | `esmfold_structure_module_only_*` | 0 (+various) | various | UR50/D 2021_04 | - | https://dl.fbaipublicfiles.com/fair-esm/models/esmfold_structure_module_only_* |
486
+ | ESM-IF1 | `esm_if1_gvp4_t16_142M_UR50` | 20 | 124M | CATH 4.3 + predicted structures for UR50 | 512 | https://dl.fbaipublicfiles.com/fair-esm/models/esm_if1_gvp4_t16_142M_UR50.pt |
487
+ | ESM-1v | `esm1v_t33_650M_UR90S_[1-5]` | 33 | 650M | UR90/S 2020_03 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1v_t33_650M_UR90S_1.pt |
488
+ | ESM-MSA-1b| `esm_msa1b_t12_100M_UR50S` | 12 | 100M | UR50/S + MSA 2018_03 | 768 | https://dl.fbaipublicfiles.com/fair-esm/models/esm_msa1b_t12_100M_UR50S.pt |
489
+ | ESM-MSA-1 | `esm_msa1_t12_100M_UR50S` | 12 | 100M | UR50/S + MSA 2018_03 | 768 | https://dl.fbaipublicfiles.com/fair-esm/models/esm_msa1_t12_100M_UR50S.pt |
490
+ | ESM-1b | `esm1b_t33_650M_UR50S` | 33 | 650M | UR50/S 2018_03 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1b_t33_650M_UR50S.pt |
491
+ | ESM-1 | `esm1_t34_670M_UR50S` | 34 | 670M | UR50/S 2018_03 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1_t34_670M_UR50S.pt |
492
+ | | `esm1_t34_670M_UR50D` | 34 | 670M | UR50/D 2018_03 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1_t34_670M_UR50D.pt |
493
+ | | `esm1_t34_670M_UR100` | 34 | 670M | UR100 2018_03 | 1280 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1_t34_670M_UR100.pt |
494
+ | | `esm1_t12_85M_UR50S` | 12 | 85M | UR50/S 2018_03 | 768 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1_t12_85M_UR50S.pt |
495
+ | | `esm1_t6_43M_UR50S` | 6 | 43M | UR50/S 2018_03 | 768 | https://dl.fbaipublicfiles.com/fair-esm/models/esm1_t6_43M_UR50S.pt |
496
+
497
+
498
+ Here is a chronological list of the released models and the paper they were introduced in:
499
+
500
+ | Shorthand | Release Notes |
501
+ |------------|---------------|
502
+ | ESM-1 | Released with Rives et al. 2019 (Aug 2020 update). |
503
+ | ESM-1b | Released with Rives et al. 2019 (Dec 2020 update). See Appendix B. |
504
+ | ESM-MSA-1 | Released with Rao et al. 2021 (Preprint v1). |
505
+ | ESM-MSA-1b | Released with Rao et al. 2021 (ICML'21 version, June 2021). |
506
+ | ESM-1v | Released with Meier et al. 2021. |
507
+ | ESM-IF1 | Released with Hsu et al. 2022. |
508
+ | ESM-2 | Released with Lin et al. 2022. |
509
+
510
+ ### ESM Structural Split Dataset <a name="available-esmssd"></a>
511
+ This is a five-fold cross validation dataset of protein domain structures that can be used to measure generalization of representations
512
+ across different levels of structural dissimilarity.
513
+ The dataset implements structural holdouts at the family, superfamily, and fold
514
+ level. The SCOPe database is used to classify domains. Independently for each level of structural hold-out,
515
+ the domains are split into 5 equal sets, i.e. five sets of folds, superfamilies, or families. This ensures
516
+ that for each of the five partitions, structures having the same classification do not appear in both the
517
+ train and test sets. For a given classification level each structure appears in a test set once, so that
518
+ in the cross validation experiment each of the structures will be evaluated exactly once.
519
+
520
+ The dataset provides 3d coordinates, distance maps, and secondary structure labels.
521
+ For further details on the construction of the dataset
522
+ see [Rives et al. 2019](https://doi.org/10.1101/622803) Appendix A.10.
523
+
524
+ This [jupyter notebook tutorial](examples/esm_structural_dataset.ipynb) shows how to load and index the `ESMStructuralSplitDataset`.
525
+
526
+ `ESMStructuralSplitDataset`, upon initializing, will download `splits` and `pkl`.
527
+ We also provide `msas` for each of the domains. The data can be directly downloaded below.
528
+
529
+ | Name | Description | URL |
530
+ |--------|-------------------------------------------------------------------------------|-----------------------------------------------------------------------|
531
+ | splits | train/valid splits | https://dl.fbaipublicfiles.com/fair-esm/structural-data/splits.tar.gz |
532
+ | pkl | pkl objects containing sequence, SSP labels, distance map, and 3d coordinates | https://dl.fbaipublicfiles.com/fair-esm/structural-data/pkl.tar.gz |
533
+ | msas | a3m files containing MSA for each domain | https://dl.fbaipublicfiles.com/fair-esm/structural-data/msas.tar.gz |
534
+
535
+ ### Pre-training Dataset Split <a name="available-pretraining-split"></a>
536
+ The split files establishing which UniRef50 clusters were used as held-out evaluation set for pre-training
537
+ in [Rives et al. 2019](https://doi.org/10.1101/622803) and [Rao et al. 2021](https://doi.org/10.1101/2021.02.12.430858) can be found here:
538
+ * [UniRef50 IDs of evaluation set](https://dl.fbaipublicfiles.com/fair-esm/pretraining-data/uniref201803_ur50_valid_headers.txt.gz): 3.016 M clusters
539
+ * [UniRef100 IDs of evaluation set](https://dl.fbaipublicfiles.com/fair-esm/pretraining-data/uniref201803_ur100_valid_headers.txt.gz): 13.745 M proteins, expanding the same UniRef50 clusters.
540
+
541
+ These files contain only the UniRef50 IDs and UniRef100 IDs corresponding to the [UniRef database, 2018-03 release](https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/release-2018_03/uniref/)
542
+ which is released by the UniProt Consortium under a [Creative Commons Attribution (CC BY 4.0) License](https://www.uniprot.org/help/license).
543
+
544
+
545
+ ### Comparison to related works <a name="perf_related"></a>
546
+ <!--
547
+ DO NOT EDIT THIS TABLE! This is the source of truth:
548
+ https://docs.google.com/spreadsheets/d/1RPvWF47rIMEr-Jg-SRCoGElHcwCl5d7RyEeSyPgp59A/edit#gid=0
549
+ exported via https://www.tablesgenerator.com/html_tables
550
+ -->
551
+
552
+ <table class="tg">
553
+ <thead>
554
+ <tr>
555
+ <th class="tg-0thz"><span style="font-weight:bold">Task</span></th>
556
+ <th class="tg-j6zm" colspan="3"><span style="font-weight:bold">Unsupervised contact prediction</span></th>
557
+ <th class="tg-j6zm" colspan="2"><span style="font-weight:bold">Structure Prediction</span></th>
558
+ </tr>
559
+ </thead>
560
+ <tbody>
561
+ <tr>
562
+ <td class="tg-j6zm"><span style="font-weight:bold">Test set</span></td>
563
+ <td class="tg-j6zm"><span style="font-weight:bold">Large valid</span></td>
564
+ <td class="tg-j6zm"><span style="font-weight:bold">CASP14</span></td>
565
+ <td class="tg-j6zm"><span style="font-weight:bold">CAMEO (Apr-Jun 2022)</span></td>
566
+ <td class="tg-j6zm"><span style="font-weight:bold">CASP14</span></td>
567
+ <td class="tg-j6zm"><span style="font-weight:bold">CAMEO (Apr-Jun 2022)</span></td>
568
+ </tr>
569
+ <tr>
570
+ <td class="tg-7zrl">Gremlin (Potts)</td>
571
+ <td class="tg-7zrl">39.3</td>
572
+ <td class="tg-7zrl"></td>
573
+ <td class="tg-7zrl"></td>
574
+ <td class="tg-7zrl"></td>
575
+ <td class="tg-7zrl"></td>
576
+ </tr>
577
+ <tr>
578
+ <td class="tg-7zrl">TAPE</td>
579
+ <td class="tg-7zrl">11.2</td>
580
+ <td class="tg-7zrl"></td>
581
+ <td class="tg-7zrl"></td>
582
+ <td class="tg-7zrl"></td>
583
+ <td class="tg-7zrl"></td>
584
+ </tr>
585
+ <tr>
586
+ <td class="tg-7zrl">ProtBert-BFD</td>
587
+ <td class="tg-7zrl">34.1</td>
588
+ <td class="tg-7zrl"></td>
589
+ <td class="tg-7zrl"></td>
590
+ <td class="tg-7zrl"></td>
591
+ <td class="tg-7zrl"></td>
592
+ </tr>
593
+ <tr>
594
+ <td class="tg-7zrl">Prot-T5-XL-BFD</td>
595
+ <td class="tg-7zrl">35.6</td>
596
+ <td class="tg-7zrl"></td>
597
+ <td class="tg-7zrl"></td>
598
+ <td class="tg-2b7s">46.1</td>
599
+ <td class="tg-2b7s">62.6</td>
600
+ </tr>
601
+ <tr>
602
+ <td class="tg-7zrl">Prot-T5-XL-Ur50 (3B)</td>
603
+ <td class="tg-7zrl">47.9</td>
604
+ <td class="tg-7zrl"></td>
605
+ <td class="tg-7zrl"></td>
606
+ <td class="tg-2b7s">49.8</td>
607
+ <td class="tg-2b7s">69.4</td>
608
+ </tr>
609
+ <tr>
610
+ <td class="tg-7zrl">ESM-1</td>
611
+ <td class="tg-7zrl">33.7</td>
612
+ <td class="tg-7zrl"></td>
613
+ <td class="tg-7zrl"></td>
614
+ <td class="tg-7zrl"></td>
615
+ <td class="tg-7zrl"></td>
616
+ </tr>
617
+ <tr>
618
+ <td class="tg-7zrl">ESM-1b</td>
619
+ <td class="tg-7zrl">41.1</td>
620
+ <td class="tg-7zrl">24.4</td>
621
+ <td class="tg-7zrl">39</td>
622
+ <td class="tg-2b7s">41.6</td>
623
+ <td class="tg-2b7s">64.5</td>
624
+ </tr>
625
+ <tr>
626
+ <td class="tg-7zrl">ESM-1v</td>
627
+ <td class="tg-7zrl">35.3</td>
628
+ <td class="tg-7zrl"></td>
629
+ <td class="tg-7zrl"></td>
630
+ <td class="tg-7zrl"></td>
631
+ <td class="tg-7zrl"></td>
632
+ </tr>
633
+ <tr>
634
+ <td class="tg-7zrl">ESM-MSA-1b</td>
635
+ <td class="tg-7zrl">57.4</td>
636
+ <td class="tg-7zrl"></td>
637
+ <td class="tg-7zrl"></td>
638
+ <td class="tg-7zrl"></td>
639
+ <td class="tg-7zrl"></td>
640
+ </tr>
641
+ <tr>
642
+ <td class="tg-7zrl">ESM-2 (8M)</td>
643
+ <td class="tg-7zrl">15.9</td>
644
+ <td class="tg-7zrl">9.8</td>
645
+ <td class="tg-7zrl">15.7</td>
646
+ <td class="tg-2b7s">36.7</td>
647
+ <td class="tg-2b7s">48.1</td>
648
+ </tr>
649
+ <tr>
650
+ <td class="tg-7zrl">ESM-2 (35M)</td>
651
+ <td class="tg-7zrl">28.8</td>
652
+ <td class="tg-7zrl">16.4</td>
653
+ <td class="tg-7zrl">28.4</td>
654
+ <td class="tg-2b7s">41.4</td>
655
+ <td class="tg-2b7s">56.4</td>
656
+ </tr>
657
+ <tr>
658
+ <td class="tg-7zrl">ESM-2 (150M)</td>
659
+ <td class="tg-7zrl">42.2</td>
660
+ <td class="tg-7zrl">26.8</td>
661
+ <td class="tg-7zrl">40.1</td>
662
+ <td class="tg-2b7s">49.0</td>
663
+ <td class="tg-2b7s">64.9</td>
664
+ </tr>
665
+ <tr>
666
+ <td class="tg-7zrl">ESM-2 (700M)</td>
667
+ <td class="tg-7zrl">50.1</td>
668
+ <td class="tg-7zrl">32.5</td>
669
+ <td class="tg-7zrl">47.6</td>
670
+ <td class="tg-2b7s">51.3</td>
671
+ <td class="tg-2b7s">70.1</td>
672
+ </tr>
673
+ <tr>
674
+ <td class="tg-7zrl">ESM-2 (3B)</td>
675
+ <td class="tg-7zrl">52.7</td>
676
+ <td class="tg-7zrl">34.0</td>
677
+ <td class="tg-7zrl">49.9</td>
678
+ <td class="tg-2b7s">52.5</td>
679
+ <td class="tg-2b7s">71.8</td>
680
+ </tr>
681
+ <tr>
682
+ <td class="tg-7zrl">ESM-2 (15B)</td>
683
+ <td class="tg-7zrl">54.5</td>
684
+ <td class="tg-7zrl">37.0</td>
685
+ <td class="tg-7zrl">51.7</td>
686
+ <td class="tg-2b7s">55.4</td>
687
+ <td class="tg-2b7s">72.1</td>
688
+ </tr>
689
+ </tbody>
690
+ </table>
691
+
692
+ Comparison to related protein language models on structure prediction tasks.
693
+
694
+ * All contact numbers are the top-L,LR precision metric, where long range means sequence separation of at least 24 residues
695
+ * For unsupervised contact prediction, a sparse linear combination of the attention heads is used to directly predict protein contacts,
696
+ fitted with logistic regression on 20 structures.
697
+ For more details on the method, see [Rao et al. 2020](https://doi.org/10.1101/2020.12.15.422761).
698
+ * For structure prediction, an AlphaFold2 structure module is trained directly from the frozen language model embeddings.
699
+ For more details on the method, see [Lin et al. 2022](https://www.science.org/doi/abs/10.1126/science.ade2574).
700
+ * Direct coupling analysis methods (Gremlin, mfDCA, Psicov) and ESM-MSA-1 use the [trRosetta MSAs](https://yanglab.nankai.edu.cn/trRosetta/benchmark/), while other methods predict from single sequence.
701
+
702
+
703
+ ## Citations <a name="citations"></a>
704
+
705
+ If you find the models useful in your research, we ask that you cite the relevant paper:
706
+
707
+ ```bibtex
708
+ @article{rives2019biological,
709
+ author={Rives, Alexander and Meier, Joshua and Sercu, Tom and Goyal, Siddharth and Lin, Zeming and Liu, Jason and Guo, Demi and Ott, Myle and Zitnick, C. Lawrence and Ma, Jerry and Fergus, Rob},
710
+ title={Biological Structure and Function Emerge from Scaling Unsupervised Learning to 250 Million Protein Sequences},
711
+ year={2019},
712
+ doi={10.1101/622803},
713
+ url={https://www.biorxiv.org/content/10.1101/622803v4},
714
+ journal={PNAS}
715
+ }
716
+ ```
717
+
718
+ For the self-attention contact prediction:
719
+
720
+ ```bibtex
721
+ @article{rao2020transformer,
722
+ author = {Rao, Roshan M and Meier, Joshua and Sercu, Tom and Ovchinnikov, Sergey and Rives, Alexander},
723
+ title={Transformer protein language models are unsupervised structure learners},
724
+ year={2020},
725
+ doi={10.1101/2020.12.15.422761},
726
+ url={https://www.biorxiv.org/content/10.1101/2020.12.15.422761v1},
727
+ journal={bioRxiv}
728
+ }
729
+ ```
730
+
731
+ For the MSA Transformer:
732
+
733
+ ```bibtex
734
+ @article{rao2021msa,
735
+ author = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier, Joshua and Canny, John F. and Abbeel, Pieter and Sercu, Tom and Rives, Alexander},
736
+ title={MSA Transformer},
737
+ year={2021},
738
+ doi={10.1101/2021.02.12.430858},
739
+ url={https://www.biorxiv.org/content/10.1101/2021.02.12.430858v1},
740
+ journal={bioRxiv}
741
+ }
742
+ ```
743
+
744
+ For variant prediction using ESM-1v:
745
+
746
+ ```bibtex
747
+ @article{meier2021language,
748
+ author = {Meier, Joshua and Rao, Roshan and Verkuil, Robert and Liu, Jason and Sercu, Tom and Rives, Alexander},
749
+ title = {Language models enable zero-shot prediction of the effects of mutations on protein function},
750
+ year={2021},
751
+ doi={10.1101/2021.07.09.450648},
752
+ url={https://www.biorxiv.org/content/10.1101/2021.07.09.450648v1},
753
+ journal={bioRxiv}
754
+ }
755
+ ```
756
+
757
+ For inverse folding using ESM-IF1:
758
+
759
+ ```bibtex
760
+ @article{hsu2022learning,
761
+ author = {Hsu, Chloe and Verkuil, Robert and Liu, Jason and Lin, Zeming and Hie, Brian and Sercu, Tom and Lerer, Adam and Rives, Alexander},
762
+ title = {Learning inverse folding from millions of predicted structures},
763
+ year = {2022},
764
+ doi = {10.1101/2022.04.10.487779},
765
+ url = {https://www.biorxiv.org/content/early/2022/04/10/2022.04.10.487779},
766
+ journal = {ICML}
767
+ }
768
+ ```
769
+
770
+ For the ESM-2 language model and ESMFold:
771
+
772
+ ```bibtex
773
+ @article{lin2022language,
774
+ title={Language models of protein sequences at the scale of evolution enable accurate structure prediction},
775
+ author={Lin, Zeming and Akin, Halil and Rao, Roshan and Hie, Brian and Zhu, Zhongkai and Lu, Wenting and Smetanin, Nikita and dos Santos Costa, Allan and Fazel-Zarandi, Maryam and Sercu, Tom and Candido, Sal and others},
776
+ journal={bioRxiv},
777
+ year={2022},
778
+ publisher={Cold Spring Harbor Laboratory}
779
+ }
780
+ ```
781
+
782
+ Much of this code builds on the [fairseq](https://github.com/pytorch/fairseq) sequence modeling framework. We use fairseq internally for our protein language modeling research. We highly recommend trying it out if you'd like to pre-train protein language models from scratch.
783
+
784
+ Additionally, if you would like to use the variant prediction benchmark from Meier et al. (2021), we provide a bibtex file with citations for all data in [./examples/variant-prediction/mutation_data.bib](./examples/variant-prediction/mutation_data.bib). You can cite each paper individually, or add all citations in bulk using the LaTeX command:
785
+
786
+ ```tex
787
+ \nocite{wrenbeck2017deep,klesmith2015comprehensive,haddox2018mapping,romero2015dissecting,firnberg2014comprehensive,deng2012deep,stiffler2015evolvability,jacquier2013capturing,findlay2018comprehensive,mclaughlin2012spatial,kitzman2015massively,doud2016accurate,pokusaeva2019experimental,mishra2016systematic,kelsic2016rna,melnikov2014comprehensive,brenan2016phenotypic,rockah2015systematic,wu2015functional,aakre2015evolving,qi2014quantitative,matreyek2018multiplex,bandaru2017deconstruction,roscoe2013analyses,roscoe2014systematic,mavor2016determination,chan2017correlation,melamed2013deep,starita2013activity,araya2012fundamental}
788
+ ```
789
+
790
+ ## License <a name="license"></a>
791
+
792
+ This source code is licensed under the MIT license found in the `LICENSE` file
793
+ in the root directory of this source tree.
794
+
795
+ ESM Metagenomic Atlas (also referred to as “ESM Metagenomic Structure Atlas” or “ESM Atlas”) data is available under a CC BY 4.0 license for academic and commercial use. Copyright (c) Meta Platforms, Inc. All Rights Reserved. Use of the ESM Metagenomic Atlas data is subject to the Meta Open Source [Terms of Use](https://opensource.fb.com/legal/terms/) and [Privacy Policy](https://opensource.fb.com/legal/privacy/).
esm/source/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""
Package initialization file for the ``esm`` project.
"""
esm/source/environment.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Conda environment definition for ESMFold.
name: esmfold
channels:
  - conda-forge
  - bioconda
  - pytorch
dependencies:
  - conda-forge::python=3.7
  - conda-forge::setuptools=59.5.0
  - conda-forge::pip
  # NOTE(review): openmm/pdbfixer are presumably used for structure
  # relaxation / PDB handling — confirm against the pipeline code.
  - conda-forge::openmm=7.5.1
  - conda-forge::pdbfixer
  - conda-forge::cudatoolkit==11.3.*
  - conda-forge::einops
  - conda-forge::fairscale
  - conda-forge::omegaconf
  - conda-forge::hydra-core
  - conda-forge::pandas
  - conda-forge::pytest
  # NOTE(review): hmmer/hhsuite/kalign2 look like MSA-generation tools —
  # verify which stage of the pipeline invokes them.
  - bioconda::hmmer==3.3.2
  - bioconda::hhsuite==3.3.0
  - bioconda::kalign2==2.04
  - pytorch::pytorch=1.12.*
  - pip:
      - biopython==1.79
      - deepspeed==0.5.9
      - dm-tree==0.1.6
      - ml-collections==0.1.0
      - numpy==1.21.2
      - PyYAML==5.4.1
      - requests==2.26.0
      - scipy==1.7.1
      - tqdm==4.62.2
      - typing-extensions==3.10.0.2
      - pytorch_lightning==1.5.10
      - wandb==0.12.21
      - git+https://github.com/NVIDIA/dllogger.git
esm/source/esm/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .version import version as __version__ # noqa
7
+
8
+ from .data import Alphabet, BatchConverter, FastaBatchedDataset # noqa
9
+ from .model.esm1 import ProteinBertModel # noqa
10
+ from .model.esm2 import ESM2 # noqa
11
+ from .model.msa_transformer import MSATransformer #noqa
12
+ from . import pretrained # noqa
esm/source/esm/axial_attention.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+
11
class RowSelfAttention(nn.Module):
    """Compute self-attention over rows of a 2D input.

    Input layout is ``(num_rows, num_cols, batch, embed_dim)``.  Attention
    logits are summed over the row axis (see ``compute_attention_weights``),
    producing a single per-head ``(cols x cols)`` attention map that is
    shared by every row.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        max_tokens_per_msa: int = 2 ** 16,
    ):
        super().__init__()
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        # Standard scaled dot-product attention factor, 1/sqrt(head_dim).
        self.scaling = self.head_dim ** -0.5
        # Above this token count (rows * cols), the inference-time forward
        # pass chunks over rows to bound peak memory.
        self.max_tokens_per_msa = max_tokens_per_msa
        # einsum output subscript for the shared attention map:
        # (heads, batch, col_i, col_j).
        self.attn_shape = "hnij"

        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout_module = nn.Dropout(dropout)

    def align_scaling(self, q):
        # Extra 1/sqrt(num_rows) on top of the head-dim scaling, because the
        # attention logits are summed across the row (alignment) axis.
        num_rows = q.size(0)
        return self.scaling / math.sqrt(num_rows)

    def _batched_forward(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        """Memory-bounded forward: accumulate logits over row chunks, softmax
        once, then apply the shared attention map chunk by chunk."""
        num_rows, num_cols, batch_size, embed_dim = x.size()
        max_rows = max(1, self.max_tokens_per_msa // num_cols)
        attns = 0
        scaling = self.align_scaling(x)
        for start in range(0, num_rows, max_rows):
            attn_weights = self.compute_attention_weights(
                x[start : start + max_rows],
                scaling,
                self_attn_mask=self_attn_mask,
                self_attn_padding_mask=self_attn_padding_mask[:, start : start + max_rows]
                if self_attn_padding_mask is not None
                else None,
            )
            attns += attn_weights
        attn_probs = attns.softmax(-1)
        attn_probs = self.dropout_module(attn_probs)

        outputs = []
        for start in range(0, num_rows, max_rows):
            output = self.compute_attention_update(x[start : start + max_rows], attn_probs)
            outputs.append(output)

        output = torch.cat(outputs, 0)
        return output, attn_probs

    def compute_attention_weights(
        self,
        x,
        scaling: float,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        num_rows, num_cols, batch_size, embed_dim = x.size()
        q = self.q_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
        k = self.k_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
        q *= scaling
        if self_attn_padding_mask is not None:
            # Zero out any padded aligned positions - this is important since
            # we take a sum across the alignment axis.
            q *= 1 - self_attn_padding_mask.permute(1, 2, 0).unsqueeze(3).unsqueeze(4).to(q)

        # Sum over the row axis r -> one (heads, batch, cols, cols) map.
        attn_weights = torch.einsum(f"rinhd,rjnhd->{self.attn_shape}", q, k)

        if self_attn_mask is not None:
            raise NotImplementedError
            # Mask Size: [B x R x C], Weights Size: [H x B x C x C]

        if self_attn_padding_mask is not None:
            # Large negative fill so padded key columns vanish after softmax.
            attn_weights = attn_weights.masked_fill(
                self_attn_padding_mask[:, 0].unsqueeze(0).unsqueeze(2),
                -10000,
            )

        return attn_weights

    def compute_attention_update(
        self,
        x,
        attn_probs,
    ):
        # Apply the shared attention map to the values of every row.
        num_rows, num_cols, batch_size, embed_dim = x.size()
        v = self.v_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
        context = torch.einsum(f"{self.attn_shape},rjnhd->rinhd", attn_probs, v)
        context = context.contiguous().view(num_rows, num_cols, batch_size, embed_dim)
        output = self.out_proj(context)
        return output

    def forward(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        num_rows, num_cols, batch_size, embed_dim = x.size()
        # Chunked path only at inference time; training needs the full graph.
        if (num_rows * num_cols > self.max_tokens_per_msa) and not torch.is_grad_enabled():
            return self._batched_forward(x, self_attn_mask, self_attn_padding_mask)
        else:
            scaling = self.align_scaling(x)
            attn_weights = self.compute_attention_weights(
                x, scaling, self_attn_mask, self_attn_padding_mask
            )
            attn_probs = attn_weights.softmax(-1)
            attn_probs = self.dropout_module(attn_probs)
            output = self.compute_attention_update(x, attn_probs)
            return output, attn_probs
131
+
132
+
133
class ColumnSelfAttention(nn.Module):
    """Compute self-attention over columns of a 2D input.

    Input layout is ``(num_rows, num_cols, batch, embed_dim)``; each column
    attends over its rows independently.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        max_tokens_per_msa: int = 2 ** 16,
    ):
        super().__init__()

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        # Standard scaled dot-product attention factor, 1/sqrt(head_dim).
        self.scaling = self.head_dim ** -0.5
        # Above this token count (rows * cols), the inference-time forward
        # pass chunks over columns to bound peak memory.
        self.max_tokens_per_msa = max_tokens_per_msa

        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout_module = nn.Dropout(dropout)

    def _batched_forward(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        # Column attention is independent per column, so chunks of columns
        # can be processed separately and concatenated.
        num_rows, num_cols, batch_size, embed_dim = x.size()
        max_cols = max(1, self.max_tokens_per_msa // num_rows)
        outputs = []
        attns = []
        for start in range(0, num_cols, max_cols):
            output, attn = self(
                x[:, start : start + max_cols],
                self_attn_mask=self_attn_mask,
                self_attn_padding_mask=self_attn_padding_mask[:, :, start : start + max_cols]
                if self_attn_padding_mask is not None
                else None,
            )
            outputs.append(output)
            attns.append(attn)
        output = torch.cat(outputs, 1)
        attns = torch.cat(attns, 1)
        return output, attns

    def compute_attention_update(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        num_rows, num_cols, batch_size, embed_dim = x.size()
        if num_rows == 1:
            # if there is only 1 position, this is equivalent and doesn't break with padding
            attn_probs = torch.ones(
                self.num_heads,
                num_cols,
                batch_size,
                num_rows,
                num_rows,
                device=x.device,
                dtype=x.dtype,
            )
            output = self.out_proj(self.v_proj(x))
        else:
            q = self.q_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
            k = self.k_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
            v = self.v_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
            q *= self.scaling

            # Attend along the row axis within each column independently:
            # output is (heads, cols, batch, row_i, row_j).
            attn_weights = torch.einsum("icnhd,jcnhd->hcnij", q, k)

            if self_attn_mask is not None:
                raise NotImplementedError
            if self_attn_padding_mask is not None:
                # Large negative fill so padded positions vanish after softmax.
                attn_weights = attn_weights.masked_fill(
                    self_attn_padding_mask.permute(2, 0, 1).unsqueeze(0).unsqueeze(3),
                    -10000,
                )

            attn_probs = attn_weights.softmax(-1)
            attn_probs = self.dropout_module(attn_probs)
            context = torch.einsum("hcnij,jcnhd->icnhd", attn_probs, v)
            context = context.contiguous().view(num_rows, num_cols, batch_size, embed_dim)
            output = self.out_proj(context)
        return output, attn_probs

    def forward(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
    ):
        num_rows, num_cols, batch_size, embed_dim = x.size()
        # if False and num_rows * num_cols > 2 ** 14 and not torch.is_grad_enabled():
        if (num_rows * num_cols) > self.max_tokens_per_msa and not torch.is_grad_enabled():
            return self._batched_forward(
                x,
                self_attn_mask,
                self_attn_padding_mask,
            )
        else:
            return self.compute_attention_update(x, self_attn_mask, self_attn_padding_mask)
esm/source/esm/constants.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
# fmt: off
# Protein sequence token set: the 20 canonical amino acids, ambiguity codes
# (X, B, Z), rare residues (U, O), and the alignment characters '.' and '-'.
proteinseq_toks = {
    'toks': ['L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', '.', '-']
}
# fmt: on
esm/source/esm/data.py ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import itertools
7
+ import os
8
+ from typing import Sequence, Tuple, List, Union
9
+ import pickle
10
+ import re
11
+ import shutil
12
+ import torch
13
+ from pathlib import Path
14
+ from esm.constants import proteinseq_toks
15
+
16
+ RawMSA = Sequence[Tuple[str, str]]
17
+
18
+
19
class FastaBatchedDataset(object):
    """In-memory dataset of (label, sequence-string) pairs parsed from FASTA."""

    def __init__(self, sequence_labels, sequence_strs):
        self.sequence_labels = list(sequence_labels)
        self.sequence_strs = list(sequence_strs)

    @classmethod
    def from_file(cls, fasta_file):
        """Parse a FASTA file into a dataset.

        Header lines (``>``) become labels; a header with no text gets a
        synthetic ``seqnum<line>`` label.  Raises AssertionError on
        duplicate labels.
        """
        sequence_labels, sequence_strs = [], []
        cur_seq_label = None
        buf = []

        def _flush_current_seq():
            # Commit the record accumulated so far, if any.
            nonlocal cur_seq_label, buf
            if cur_seq_label is None:
                return
            sequence_labels.append(cur_seq_label)
            sequence_strs.append("".join(buf))
            cur_seq_label = None
            buf = []

        with open(fasta_file, "r") as infile:
            for line_idx, line in enumerate(infile):
                if line.startswith(">"):  # label line
                    _flush_current_seq()
                    line = line[1:].strip()
                    if len(line) > 0:
                        cur_seq_label = line
                    else:
                        cur_seq_label = f"seqnum{line_idx:09d}"
                else:  # sequence line
                    buf.append(line.strip())

        # Flush the final record (no trailing ">" to trigger it).
        _flush_current_seq()

        assert len(set(sequence_labels)) == len(
            sequence_labels
        ), "Found duplicate sequence labels"

        return cls(sequence_labels, sequence_strs)

    def __len__(self):
        return len(self.sequence_labels)

    def __getitem__(self, idx):
        return self.sequence_labels[idx], self.sequence_strs[idx]

    def get_batch_indices(self, toks_per_batch, extra_toks_per_seq=0):
        """Group sequence indices into batches of at most ``toks_per_batch``
        padded tokens.

        Sequences are sorted by length first so each batch is roughly
        homogeneous; a batch's cost is its longest member times its size.
        Returns a list of lists of dataset indices.
        """
        sizes = [(len(s), i) for i, s in enumerate(self.sequence_strs)]
        sizes.sort()
        batches = []
        buf = []
        max_len = 0

        def _flush_current_buf():
            nonlocal max_len, buf
            if len(buf) == 0:
                return
            batches.append(buf)
            buf = []
            max_len = 0

        for sz, i in sizes:
            sz += extra_toks_per_seq
            # Padded batch cost = max length * (current count + 1).
            if max(sz, max_len) * (len(buf) + 1) > toks_per_batch:
                _flush_current_buf()
            max_len = max(max_len, sz)
            buf.append(i)

        _flush_current_buf()
        return batches
89
+
90
+
91
class Alphabet(object):
    """Vocabulary mapping protein tokens to integer indices, plus the
    tokenizer and batch-converter factory used by ESM models."""

    def __init__(
        self,
        standard_toks: Sequence[str],
        prepend_toks: Sequence[str] = ("<null_0>", "<pad>", "<eos>", "<unk>"),
        append_toks: Sequence[str] = ("<cls>", "<mask>", "<sep>"),
        prepend_bos: bool = True,
        append_eos: bool = False,
        use_msa: bool = False,
    ):
        self.standard_toks = list(standard_toks)
        self.prepend_toks = list(prepend_toks)
        self.append_toks = list(append_toks)
        self.prepend_bos = prepend_bos
        self.append_eos = append_eos
        self.use_msa = use_msa

        # Vocabulary order: prepend tokens, standard tokens, <null_i> padding
        # up to a multiple of 8, then append tokens.
        self.all_toks = list(self.prepend_toks)
        self.all_toks.extend(self.standard_toks)
        for i in range((8 - (len(self.all_toks) % 8)) % 8):
            self.all_toks.append(f"<null_{i + 1}>")
        self.all_toks.extend(self.append_toks)

        self.tok_to_idx = {tok: i for i, tok in enumerate(self.all_toks)}

        self.unk_idx = self.tok_to_idx["<unk>"]
        self.padding_idx = self.get_idx("<pad>")
        self.cls_idx = self.get_idx("<cls>")
        self.mask_idx = self.get_idx("<mask>")
        self.eos_idx = self.get_idx("<eos>")
        self.all_special_tokens = ['<eos>', '<unk>', '<pad>', '<cls>', '<mask>']
        self.unique_no_split_tokens = self.all_toks

    def __len__(self):
        return len(self.all_toks)

    def get_idx(self, tok):
        # Unknown tokens map to the <unk> index.
        return self.tok_to_idx.get(tok, self.unk_idx)

    def get_tok(self, ind):
        return self.all_toks[ind]

    def to_dict(self):
        return self.tok_to_idx.copy()

    def get_batch_converter(self, truncation_seq_length: int = None):
        """Return the batch converter matching this alphabet (MSA or single)."""
        if self.use_msa:
            return MSABatchConverter(self, truncation_seq_length)
        else:
            return BatchConverter(self, truncation_seq_length)

    @classmethod
    def from_architecture(cls, name: str) -> "Alphabet":
        """Build the alphabet configured for a named model architecture."""
        if name in ("ESM-1", "protein_bert_base"):
            standard_toks = proteinseq_toks["toks"]
            prepend_toks: Tuple[str, ...] = ("<null_0>", "<pad>", "<eos>", "<unk>")
            append_toks: Tuple[str, ...] = ("<cls>", "<mask>", "<sep>")
            prepend_bos = True
            append_eos = False
            use_msa = False
        elif name in ("ESM-1b", "roberta_large"):
            standard_toks = proteinseq_toks["toks"]
            prepend_toks = ("<cls>", "<pad>", "<eos>", "<unk>")
            append_toks = ("<mask>",)
            prepend_bos = True
            append_eos = True
            use_msa = False
        elif name in ("MSA Transformer", "msa_transformer"):
            standard_toks = proteinseq_toks["toks"]
            prepend_toks = ("<cls>", "<pad>", "<eos>", "<unk>")
            append_toks = ("<mask>",)
            prepend_bos = True
            append_eos = False
            use_msa = True
        elif "invariant_gvp" in name.lower():
            standard_toks = proteinseq_toks["toks"]
            prepend_toks = ("<null_0>", "<pad>", "<eos>", "<unk>")
            append_toks = ("<mask>", "<cath>", "<af2>")
            prepend_bos = True
            append_eos = False
            use_msa = False
        else:
            raise ValueError("Unknown architecture selected")
        return cls(standard_toks, prepend_toks, append_toks, prepend_bos, append_eos, use_msa)

    def _tokenize(self, text) -> List[str]:
        # NOTE(review): annotation corrected from `-> str`; str.split returns
        # a list of strings.
        return text.split()

    def tokenize(self, text, **kwargs) -> List[str]:
        """
        Inspired by https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_utils.py
        Converts a string in a sequence of tokens, using the tokenizer.

        Args:
            text (:obj:`str`):
                The sequence to be encoded.

        Returns:
            :obj:`List[str]`: The list of tokens.
        """

        def split_on_token(tok, text):
            result = []
            split_text = text.split(tok)
            for i, sub_text in enumerate(split_text):
                # AddedToken can control whitespace stripping around them.
                # We use them for GPT2 and Roberta to have different behavior depending on the special token
                # Cf. https://github.com/huggingface/transformers/pull/2778
                # and https://github.com/huggingface/transformers/issues/3788
                # We strip left and right by default
                if i < len(split_text) - 1:
                    sub_text = sub_text.rstrip()
                if i > 0:
                    sub_text = sub_text.lstrip()

                if i == 0 and not sub_text:
                    result.append(tok)
                elif i == len(split_text) - 1:
                    if sub_text:
                        result.append(sub_text)
                    else:
                        pass
                else:
                    if sub_text:
                        result.append(sub_text)
                    result.append(tok)
            return result

        def split_on_tokens(tok_list, text):
            # Repeatedly split around each no-split token, then tokenize the
            # remaining plain-text fragments with _tokenize.
            if not text.strip():
                return []

            tokenized_text = []
            text_list = [text]
            for tok in tok_list:
                tokenized_text = []
                for sub_text in text_list:
                    if sub_text not in self.unique_no_split_tokens:
                        tokenized_text.extend(split_on_token(tok, sub_text))
                    else:
                        tokenized_text.append(sub_text)
                text_list = tokenized_text

            return list(
                itertools.chain.from_iterable(
                    (
                        self._tokenize(token)
                        if token not in self.unique_no_split_tokens
                        else [token]
                        for token in tokenized_text
                    )
                )
            )

        no_split_token = self.unique_no_split_tokens
        tokenized_text = split_on_tokens(no_split_token, text)
        return tokenized_text

    def encode(self, text):
        """Tokenize ``text`` and map each token to its integer index."""
        return [self.tok_to_idx[tok] for tok in self.tokenize(text)]
251
+
252
+
253
class BatchConverter(object):
    """Callable to convert an unprocessed (labels + strings) batch to a
    processed (labels + tensor) batch.
    """

    def __init__(self, alphabet, truncation_seq_length: int = None):
        self.alphabet = alphabet
        # If set, encoded sequences longer than this are truncated.
        self.truncation_seq_length = truncation_seq_length

    def __call__(self, raw_batch: Sequence[Tuple[str, str]]):
        # RoBERTa uses an eos token, while ESM-1 does not.
        batch_size = len(raw_batch)
        batch_labels, seq_str_list = zip(*raw_batch)
        seq_encoded_list = [self.alphabet.encode(seq_str) for seq_str in seq_str_list]
        if self.truncation_seq_length:
            seq_encoded_list = [seq_str[:self.truncation_seq_length] for seq_str in seq_encoded_list]
        max_len = max(len(seq_encoded) for seq_encoded in seq_encoded_list)
        # Allocate (batch, [bos] + max_len + [eos]) filled with padding.
        tokens = torch.empty(
            (
                batch_size,
                max_len + int(self.alphabet.prepend_bos) + int(self.alphabet.append_eos),
            ),
            dtype=torch.int64,
        )
        tokens.fill_(self.alphabet.padding_idx)
        labels = []
        strs = []

        for i, (label, seq_str, seq_encoded) in enumerate(
            zip(batch_labels, seq_str_list, seq_encoded_list)
        ):
            labels.append(label)
            strs.append(seq_str)
            if self.alphabet.prepend_bos:
                tokens[i, 0] = self.alphabet.cls_idx
            seq = torch.tensor(seq_encoded, dtype=torch.int64)
            # Copy the encoded sequence just after the optional BOS slot.
            tokens[
                i,
                int(self.alphabet.prepend_bos) : len(seq_encoded)
                + int(self.alphabet.prepend_bos),
            ] = seq
            if self.alphabet.append_eos:
                tokens[i, len(seq_encoded) + int(self.alphabet.prepend_bos)] = self.alphabet.eos_idx

        return labels, strs, tokens
298
+
299
+
300
class MSABatchConverter(BatchConverter):
    """Batch converter for MSAs: pads a batch of alignments into one
    (batch, alignments, seqlen) token tensor."""

    def __call__(self, inputs: Union[Sequence[RawMSA], RawMSA]):
        if isinstance(inputs[0][0], str):
            # Input is a single MSA
            raw_batch: Sequence[RawMSA] = [inputs]  # type: ignore
        else:
            raw_batch = inputs  # type: ignore

        batch_size = len(raw_batch)
        max_alignments = max(len(msa) for msa in raw_batch)
        max_seqlen = max(len(msa[0][1]) for msa in raw_batch)

        tokens = torch.empty(
            (
                batch_size,
                max_alignments,
                max_seqlen + int(self.alphabet.prepend_bos) + int(self.alphabet.append_eos),
            ),
            dtype=torch.int64,
        )
        tokens.fill_(self.alphabet.padding_idx)
        labels = []
        strs = []

        for i, msa in enumerate(raw_batch):
            # All sequences within one MSA must have equal length.
            msa_seqlens = set(len(seq) for _, seq in msa)
            if not len(msa_seqlens) == 1:
                raise RuntimeError(
                    "Received unaligned sequences for input to MSA, all sequence "
                    "lengths must be equal."
                )
            # Delegate per-MSA tokenization to the single-sequence converter.
            msa_labels, msa_strs, msa_tokens = super().__call__(msa)
            labels.append(msa_labels)
            strs.append(msa_strs)
            tokens[i, : msa_tokens.size(0), : msa_tokens.size(1)] = msa_tokens

        return labels, strs, tokens
337
+
338
+
339
def read_fasta(
    path,
    keep_gaps=True,
    keep_insertions=True,
    to_upper=False,
):
    """Yield (description, sequence) pairs parsed from the FASTA file at ``path``."""
    with open(path, "r") as handle:
        yield from read_alignment_lines(
            handle,
            keep_gaps=keep_gaps,
            keep_insertions=keep_insertions,
            to_upper=to_upper,
        )
350
+
351
+
352
def read_alignment_lines(
    lines,
    keep_gaps=True,
    keep_insertions=True,
    to_upper=False,
):
    """Yield (description, sequence) pairs from FASTA-formatted lines.

    Gap characters ('-') and lowercase insertion characters can optionally
    be removed, and the result can be uppercased.
    """
    seq = desc = None

    def _clean(raw):
        # Apply the requested normalizations to one raw sequence string.
        if not keep_gaps:
            raw = re.sub("-", "", raw)
        if not keep_insertions:
            raw = re.sub("[a-z]", "", raw)
        return raw.upper() if to_upper else raw

    for line in lines:
        # Line may be empty if seq % file_line_width == 0
        is_header = len(line) > 0 and line[0] == ">"
        if is_header:
            if seq is not None:
                yield desc, _clean(seq)
            desc = line.strip().lstrip(">")
            seq = ""
        else:
            assert isinstance(seq, str)
            seq += line.strip()
    # Emit the final record; input must contain at least one header.
    assert isinstance(seq, str) and isinstance(desc, str)
    yield desc, _clean(seq)
379
+
380
+
381
class ESMStructuralSplitDataset(torch.utils.data.Dataset):
    """
    Structural Split Dataset as described in section A.10 of the supplement of our paper.
    https://doi.org/10.1101/622803

    We use the full version of SCOPe 2.07, clustered at 90% sequence identity,
    generated on January 23, 2020.

    For each SCOPe domain:
       - We extract the sequence from the corresponding PDB file
       - We extract the 3D coordinates of the Carbon beta atoms, aligning them
         to the sequence. We put NaN where Cb atoms are missing.
       - From the 3D coordinates, we calculate a pairwise distance map, based
         on L2 distance
       - We use DSSP to generate secondary structure labels for the corresponding
         PDB file. This is also aligned to the sequence. We put - where SSP
         labels are missing.

    For each SCOPe classification level of family/superfamily/fold (in order of difficulty),
    we have split the data into 5 partitions for cross validation. These are provided
    in a downloaded splits folder, in the format:
        splits/{split_level}/{cv_partition}/{train|valid}.txt
    where train is the partition and valid is the concatenation of the remaining 4.

    For each SCOPe domain, we provide a pkl dump that contains:
       - seq    : The domain sequence, stored as an L-length string
       - ssp    : The secondary structure labels, stored as an L-length string
       - dist   : The distance map, stored as an LxL numpy array
       - coords : The 3D coordinates, stored as an Lx3 numpy array

    """

    base_folder = "structural-data"
    # Each entry: (url, tar filename, extracted directory name, MD5 hash).
    file_list = [
        (
            "https://dl.fbaipublicfiles.com/fair-esm/structural-data/splits.tar.gz",
            "splits.tar.gz",
            "splits",
            "456fe1c7f22c9d3d8dfe9735da52411d",
        ),
        (
            "https://dl.fbaipublicfiles.com/fair-esm/structural-data/pkl.tar.gz",
            "pkl.tar.gz",
            "pkl",
            "644ea91e56066c750cd50101d390f5db",
        ),
    ]

    def __init__(
        self,
        split_level,
        cv_partition,
        split,
        root_path=os.path.expanduser("~/.cache/torch/data/esm"),
        download=False,
    ):
        super().__init__()
        assert split in [
            "train",
            "valid",
        ], "train_valid must be 'train' or 'valid'"
        self.root_path = root_path
        self.base_path = os.path.join(self.root_path, self.base_folder)

        # check if root path has what you need or else download it
        if download:
            self.download()

        self.split_file = os.path.join(
            self.base_path, "splits", split_level, cv_partition, f"{split}.txt"
        )
        self.pkl_dir = os.path.join(self.base_path, "pkl")
        self.names = []
        with open(self.split_file) as f:
            self.names = f.read().splitlines()

    def __len__(self):
        return len(self.names)

    def _check_exists(self) -> bool:
        # True only if every expected extracted directory is present.
        for (_, _, filename, _) in self.file_list:
            fpath = os.path.join(self.base_path, filename)
            if not os.path.exists(fpath) or not os.path.isdir(fpath):
                return False
        return True

    def download(self):
        """Download and unpack the splits and pkl archives if not present."""
        if self._check_exists():
            print("Files already downloaded and verified")
            return

        # Imported lazily so torchvision is only required when downloading.
        from torchvision.datasets.utils import download_url

        for url, tar_filename, filename, md5_hash in self.file_list:
            download_path = os.path.join(self.base_path, tar_filename)
            download_url(url=url, root=self.base_path, filename=tar_filename, md5=md5_hash)
            shutil.unpack_archive(download_path, self.base_path)

    def __getitem__(self, idx):
        """
        Returns a dict with the following entries
          - seq    : Str (domain sequence)
          - ssp    : Str (SSP labels)
          - dist   : np.array (distance map)
          - coords : np.array (3D coordinates)
        """
        name = self.names[idx]
        # pkl files are sharded into subdirectories by name[1:3].
        pkl_fname = os.path.join(self.pkl_dir, name[1:3], f"{name}.pkl")
        with open(pkl_fname, "rb") as f:
            obj = pickle.load(f)
        return obj
esm/source/esm/esmfold/v1/__init__.py ADDED
File without changes
esm/source/esm/esmfold/v1/categorical_mixture.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ import torch
6
+
7
+
8
class CategoricalMixture:
    """Discrete distribution over evenly spaced bin centers in [start, end].

    ``param`` holds unnormalized logits over the last dimension (one entry
    per bin); probabilities follow from a softmax over that dimension.
    """

    def __init__(self, param, bins=50, start=0, end=1):
        # param: tensor of shape (..., bins) containing logits.
        self.logits = param
        edges = torch.linspace(
            start, end, bins + 1, device=self.logits.device, dtype=self.logits.dtype
        )
        # Midpoints of adjacent edges -> bin centers, shape (bins,).
        self.v_bins = (edges[:-1] + edges[1:]) / 2

    def log_prob(self, true):
        """Log-probability of the bin nearest each scalar in ``true``.

        Shapes: self.logits is (..., bins); ``true`` is (...,).
        """
        # Broadcast `true` against the bin centers and pick the nearest bin.
        centers = self.v_bins[(None,) * true.ndim]
        true_index = (true.unsqueeze(-1) - centers).abs().argmin(-1)
        log_p = self.logits.log_softmax(-1)
        return torch.take_along_dim(log_p, true_index.unsqueeze(-1), dim=-1).squeeze(-1)

    def mean(self):
        """Expected value: probability-weighted sum of bin centers."""
        return (self.logits.softmax(-1) @ self.v_bins.unsqueeze(1)).squeeze(-1)
39
+
40
+
41
def categorical_lddt(logits, bins=50):
    """Expected lDDT per atom from binned logits of shape (..., 37, bins)."""
    mixture = CategoricalMixture(logits, bins=bins)
    return mixture.mean()
esm/source/esm/esmfold/v1/esmfold.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
import typing as T
from dataclasses import dataclass, field
from functools import partial

import torch
import torch.nn as nn
from torch import nn
from torch.nn import LayerNorm

import esm
from esm import Alphabet
from esm.esmfold.v1.categorical_mixture import categorical_lddt
from esm.esmfold.v1.misc import (
    batch_encode_sequences,
    collate_dense_tensors,
    output_to_pdb,
)
from esm.esmfold.v1.trunk import FoldingTrunk, FoldingTrunkConfig
from openfold.data.data_transforms import make_atom14_masks
from openfold.np import residue_constants
from openfold.utils.loss import compute_predicted_aligned_error, compute_tm
26
+
27
+
28
@dataclass
class ESMFoldConfig:
    """Top-level ESMFold hyperparameters.

    Attributes:
        trunk: Configuration object for the folding trunk.
        lddt_head_hid_dim: Hidden width of the pLDDT prediction head.
    """

    # A single mutable FoldingTrunkConfig() default instance would be shared
    # (aliased) across every ESMFoldConfig, and dataclasses on Python >= 3.11
    # reject unhashable defaults outright. default_factory builds a fresh
    # config per instance while keeping the same default value for callers.
    trunk: T.Any = field(default_factory=FoldingTrunkConfig)
    lddt_head_hid_dim: int = 128
32
+
33
+
34
# Convenience loader for ESM-2 checkpoints addressed by raw checkpoint name.
load_fn = esm.pretrained.load_model_and_alphabet
# Maps the short ESM-2 size tag stored in an ESMFold config (read as
# cfg.esm_type in ESMFold.__init__) to a zero-argument callable returning
# (model, alphabet). "_270K" entries are earlier training snapshots.
# NOTE(review): "esm2_3B_270K" points at a "_500K" checkpoint name — appears
# intentional upstream, but worth confirming.
esm_registry = {
    "esm2_8M": partial(load_fn, "esm2_t6_8M_UR50D_500K"),
    "esm2_8M_270K": esm.pretrained.esm2_t6_8M_UR50D,
    "esm2_35M": partial(load_fn, "esm2_t12_35M_UR50D_500K"),
    "esm2_35M_270K": esm.pretrained.esm2_t12_35M_UR50D,
    "esm2_150M": partial(load_fn, "esm2_t30_150M_UR50D_500K"),
    "esm2_150M_270K": partial(load_fn, "esm2_t30_150M_UR50D_270K"),
    "esm2_650M": esm.pretrained.esm2_t33_650M_UR50D,
    "esm2_650M_270K": partial(load_fn, "esm2_t33_650M_270K_UR50D"),
    "esm2_3B": esm.pretrained.esm2_t36_3B_UR50D,
    "esm2_3B_270K": partial(load_fn, "esm2_t36_3B_UR50D_500K"),
    "esm2_15B": esm.pretrained.esm2_t48_15B_UR50D,
}
48
+
49
+
50
class ESMFold(nn.Module):
    """End-to-end structure prediction: frozen ESM-2 language model feeding
    a folding trunk plus distogram/LM/pLDDT/pTM output heads.

    The language model is frozen (requires_grad_(False)) and run in fp16;
    only the trunk and heads are trainable here.
    """

    def __init__(self, esmfold_config=None, **kwargs):
        super().__init__()

        # cfg normally comes from a checkpoint; it carries fields beyond the
        # ESMFoldConfig defined in this file (esm_type, use_esm_attn_map, ...).
        self.cfg = esmfold_config if esmfold_config else ESMFoldConfig(**kwargs)
        cfg = self.cfg

        self.distogram_bins = 64

        self.esm, self.esm_dict = esm_registry.get(cfg.esm_type)()

        # Freeze the language model and run it in half precision.
        self.esm.requires_grad_(False)
        self.esm.half()

        self.esm_feats = self.esm.embed_dim
        self.esm_attns = self.esm.num_layers * self.esm.attention_heads
        self.register_buffer("af2_to_esm", ESMFold._af2_to_esm(self.esm_dict))
        # Learned softmax weights for combining all LM layer representations.
        self.esm_s_combine = nn.Parameter(torch.zeros(self.esm.num_layers + 1))

        c_s = cfg.trunk.sequence_state_dim
        c_z = cfg.trunk.pairwise_state_dim

        # Projects combined LM embeddings into the trunk's sequence state.
        self.esm_s_mlp = nn.Sequential(
            LayerNorm(self.esm_feats),
            nn.Linear(self.esm_feats, c_s),
            nn.ReLU(),
            nn.Linear(c_s, c_s),
        )
        if cfg.use_esm_attn_map:
            # Projects flattened LM attention maps into the pairwise state.
            self.esm_z_mlp = nn.Sequential(
                LayerNorm(self.esm_attns),
                nn.Linear(self.esm_attns, c_z),
                nn.ReLU(),
                nn.Linear(c_z, c_z),
            )

        # 0 is padding, N is unknown residues, N + 1 is mask.
        self.n_tokens_embed = residue_constants.restype_num + 3
        self.pad_idx = 0
        self.unk_idx = self.n_tokens_embed - 2
        self.mask_idx = self.n_tokens_embed - 1
        self.embedding = nn.Embedding(self.n_tokens_embed, c_s, padding_idx=0)

        self.trunk = FoldingTrunk(**cfg.trunk)

        self.distogram_head = nn.Linear(c_z, self.distogram_bins)
        self.ptm_head = nn.Linear(c_z, self.distogram_bins)
        self.lm_head = nn.Linear(c_s, self.n_tokens_embed)
        self.lddt_bins = 50
        # Predicts per-atom (37 atom types) binned lDDT from trunk states.
        self.lddt_head = nn.Sequential(
            nn.LayerNorm(cfg.trunk.structure_module.c_s),
            nn.Linear(cfg.trunk.structure_module.c_s, cfg.lddt_head_hid_dim),
            nn.Linear(cfg.lddt_head_hid_dim, cfg.lddt_head_hid_dim),
            nn.Linear(cfg.lddt_head_hid_dim, 37 * self.lddt_bins),
        )

    @staticmethod
    def _af2_to_esm(d: Alphabet):
        """Lookup table from shifted AF2 residue indices to ESM token ids."""
        # Remember that t is shifted from residue_constants by 1 (0 is padding).
        esm_reorder = [d.padding_idx] + [
            d.get_idx(v) for v in residue_constants.restypes_with_x
        ]
        return torch.tensor(esm_reorder)

    def _af2_idx_to_esm_idx(self, aa, mask):
        # Shift by 1 so 0 becomes padding, zero out masked positions, then
        # translate through the af2_to_esm lookup buffer.
        aa = (aa + 1).masked_fill(mask != 1, 0)
        return self.af2_to_esm[aa]

    def _compute_language_model_representations(
        self, esmaa: torch.Tensor
    ) -> T.Tuple[torch.Tensor, T.Optional[torch.Tensor]]:
        """Adds bos/eos tokens for the language model, since the structure module doesn't use these.

        Returns (esm_s, esm_z): per-layer residue embeddings (B, L, nLayers, C)
        and, only when cfg.use_esm_attn_map is set, flattened attention maps;
        otherwise esm_z is None.
        """
        batch_size = esmaa.size(0)

        bosi, eosi = self.esm_dict.cls_idx, self.esm_dict.eos_idx
        bos = esmaa.new_full((batch_size, 1), bosi)
        # Append a padding column, then overwrite the first padding position
        # per row with eos below.
        eos = esmaa.new_full((batch_size, 1), self.esm_dict.padding_idx)
        esmaa = torch.cat([bos, esmaa, eos], dim=1)
        # Use the first padding index as eos during inference.
        # NOTE(review): assumes the padding token id is 1 — confirm against
        # the alphabet in use.
        esmaa[range(batch_size), (esmaa != 1).sum(1)] = eosi

        res = self.esm(
            esmaa,
            repr_layers=range(self.esm.num_layers + 1),
            need_head_weights=self.cfg.use_esm_attn_map,
        )
        esm_s = torch.stack(
            [v for _, v in sorted(res["representations"].items())], dim=2
        )
        esm_s = esm_s[:, 1:-1]  # B, L, nLayers, C
        esm_z = (
            res["attentions"].permute(0, 4, 3, 1, 2).flatten(3, 4)[:, 1:-1, 1:-1, :]
            if self.cfg.use_esm_attn_map
            else None
        )
        return esm_s, esm_z

    def _mask_inputs_to_esm(self, esmaa, pattern):
        # Replace positions flagged in `pattern` with the LM mask token.
        new_esmaa = esmaa.clone()
        new_esmaa[pattern == 1] = self.esm_dict.mask_idx
        return new_esmaa

    def forward(
        self,
        aa: torch.Tensor,
        mask: T.Optional[torch.Tensor] = None,
        residx: T.Optional[torch.Tensor] = None,
        masking_pattern: T.Optional[torch.Tensor] = None,
        num_recycles: T.Optional[int] = None,
    ):
        """Runs a forward pass given input tokens. Use `model.infer` to
        run inference from a sequence.

        Args:
            aa (torch.Tensor): Tensor containing indices corresponding to amino acids. Indices match
                openfold.np.residue_constants.restype_order_with_x.
            mask (torch.Tensor): Binary tensor with 1 meaning position is unmasked and 0 meaning position is masked.
            residx (torch.Tensor): Residue indices of amino acids. Will assume contiguous if not provided.
            masking_pattern (torch.Tensor): Optional masking to pass to the input. Binary tensor of the same size
                as `aa`. Positions with 1 will be masked. ESMFold sometimes produces different samples when
                different masks are provided.
            num_recycles (int): How many recycle iterations to perform. If None, defaults to training max
                recycles, which is 3.
        """

        if mask is None:
            mask = torch.ones_like(aa)

        B = aa.shape[0]
        L = aa.shape[1]
        device = aa.device

        if residx is None:
            residx = torch.arange(L, device=device).expand_as(aa)

        # === ESM ===
        esmaa = self._af2_idx_to_esm_idx(aa, mask)

        if masking_pattern is not None:
            esmaa = self._mask_inputs_to_esm(esmaa, masking_pattern)

        esm_s, esm_z = self._compute_language_model_representations(esmaa)

        # Convert esm_s to the precision used by the trunk and
        # the structure module. These tensors may be a lower precision if, for example,
        # we're running the language model in fp16 precision.
        esm_s = esm_s.to(self.esm_s_combine.dtype)
        esm_s = esm_s.detach()

        # === preprocessing ===
        # Softmax-weighted combination over the LM layers -> (B, L, C).
        esm_s = (self.esm_s_combine.softmax(0).unsqueeze(0) @ esm_s).squeeze(2)

        s_s_0 = self.esm_s_mlp(esm_s)
        if self.cfg.use_esm_attn_map:
            esm_z = esm_z.to(self.esm_s_combine.dtype)
            esm_z = esm_z.detach()
            s_z_0 = self.esm_z_mlp(esm_z)
        else:
            s_z_0 = s_s_0.new_zeros(B, L, L, self.cfg.trunk.pairwise_state_dim)

        s_s_0 += self.embedding(aa)

        structure: dict = self.trunk(
            s_s_0, s_z_0, aa, residx, mask, no_recycles=num_recycles
        )
        # Documenting what we expect:
        structure = {
            k: v
            for k, v in structure.items()
            if k
            in [
                "s_z",
                "s_s",
                "frames",
                "sidechain_frames",
                "unnormalized_angles",
                "angles",
                "positions",
                "states",
            ]
        }

        # Symmetrize the distogram over the two residue dimensions.
        disto_logits = self.distogram_head(structure["s_z"])
        disto_logits = (disto_logits + disto_logits.transpose(1, 2)) / 2
        structure["distogram_logits"] = disto_logits

        lm_logits = self.lm_head(structure["s_s"])
        structure["lm_logits"] = lm_logits

        structure["aatype"] = aa
        make_atom14_masks(structure)

        # Zero out atom-existence masks at padded residue positions.
        for k in [
            "atom14_atom_exists",
            "atom37_atom_exists",
        ]:
            structure[k] *= mask.unsqueeze(-1)
        structure["residue_index"] = residx

        # states carries one entry per structure-module iteration; keep all,
        # but compute pLDDT from the last iteration only.
        lddt_head = self.lddt_head(structure["states"]).reshape(
            structure["states"].shape[0], B, L, -1, self.lddt_bins
        )
        structure["lddt_head"] = lddt_head
        plddt = categorical_lddt(lddt_head[-1], bins=self.lddt_bins)
        structure["plddt"] = (
            100 * plddt
        )  # we predict plDDT between 0 and 1, scale to be between 0 and 100.

        ptm_logits = self.ptm_head(structure["s_z"])

        seqlen = mask.type(torch.int64).sum(1)
        structure["ptm_logits"] = ptm_logits
        # pTM is computed per batch element on the unpadded length only.
        structure["ptm"] = torch.stack(
            [
                compute_tm(
                    batch_ptm_logits[None, :sl, :sl],
                    max_bins=31,
                    no_bins=self.distogram_bins,
                )
                for batch_ptm_logits, sl in zip(ptm_logits, seqlen)
            ]
        )
        structure.update(
            compute_predicted_aligned_error(
                ptm_logits, max_bin=31, no_bins=self.distogram_bins
            )
        )

        return structure

    @torch.no_grad()
    def infer(
        self,
        sequences: T.Union[str, T.List[str]],
        residx=None,
        masking_pattern: T.Optional[torch.Tensor] = None,
        num_recycles: T.Optional[int] = None,
        residue_index_offset: T.Optional[int] = 512,
        chain_linker: T.Optional[str] = "G" * 25,
    ):
        """Runs a forward pass given input sequences.

        Args:
            sequences (Union[str, List[str]]): A list of sequences to make predictions for. Multimers can also be passed in,
                each chain should be separated by a ':' token (e.g. "<chain1>:<chain2>:<chain3>").
            residx (torch.Tensor): Residue indices of amino acids. Will assume contiguous if not provided.
            masking_pattern (torch.Tensor): Optional masking to pass to the input. Binary tensor of the same size
                as `aa`. Positions with 1 will be masked. ESMFold sometimes produces different samples when
                different masks are provided.
            num_recycles (int): How many recycle iterations to perform. If None, defaults to training max
                recycles (cfg.trunk.max_recycles), which is 4.
            residue_index_offset (int): Residue index separation between chains if predicting a multimer. Has no effect on
                single chain predictions. Default: 512.
            chain_linker (str): Linker to use between chains if predicting a multimer. Has no effect on single chain
                predictions. Default: length-25 poly-G ("G" * 25).
        """
        if isinstance(sequences, str):
            sequences = [sequences]

        aatype, mask, _residx, linker_mask, chain_index = batch_encode_sequences(
            sequences, residue_index_offset, chain_linker
        )

        if residx is None:
            residx = _residx
        elif not isinstance(residx, torch.Tensor):
            residx = collate_dense_tensors(residx)

        aatype, mask, residx, linker_mask = map(
            lambda x: x.to(self.device), (aatype, mask, residx, linker_mask)
        )

        output = self.forward(
            aatype,
            mask=mask,
            residx=residx,
            masking_pattern=masking_pattern,
            num_recycles=num_recycles,
        )

        # Hide linker residues from downstream consumers (PDB output, pLDDT).
        output["atom37_atom_exists"] = output[
            "atom37_atom_exists"
        ] * linker_mask.unsqueeze(2)

        output["mean_plddt"] = (output["plddt"] * output["atom37_atom_exists"]).sum(
            dim=(1, 2)
        ) / output["atom37_atom_exists"].sum(dim=(1, 2))
        output["chain_index"] = chain_index

        return output

    def output_to_pdb(self, output: T.Dict) -> T.List[str]:
        """Returns the pdb (file) string from the model given the model output."""
        return output_to_pdb(output)

    def infer_pdbs(self, seqs: T.List[str], *args, **kwargs) -> T.List[str]:
        """Returns list of pdb (files) strings from the model given a list of input sequences."""
        output = self.infer(seqs, *args, **kwargs)
        return self.output_to_pdb(output)

    def infer_pdb(self, sequence: str, *args, **kwargs) -> str:
        """Returns the pdb (file) string from the model given an input sequence."""
        return self.infer_pdbs([sequence], *args, **kwargs)[0]

    def set_chunk_size(self, chunk_size: T.Optional[int]):
        # This parameter means the axial attention will be computed
        # in a chunked manner. This should make the memory used more or less O(L) instead of O(L^2).
        # It's equivalent to running a for loop over chunks of the dimension we're iterative over,
        # where the chunk_size is the size of the chunks, so 128 would mean to parse 128-lengthed chunks.
        # Setting the value to None will return to default behavior, disable chunking.
        self.trunk.set_chunk_size(chunk_size)

    @property
    def device(self):
        """Device the model lives on (taken from a registered parameter)."""
        return self.esm_s_combine.device
esm/source/esm/esmfold/v1/misc.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ import typing as T
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from einops import rearrange, repeat
11
+ from torch import nn
12
+ from openfold.np import residue_constants
13
+ from openfold.np.protein import Protein as OFProtein
14
+ from openfold.np.protein import to_pdb
15
+ from openfold.utils.feats import atom14_to_atom37
16
+
17
+
18
def encode_sequence(
    seq: str,
    residue_index_offset: T.Optional[int] = 512,
    chain_linker: T.Optional[str] = "G" * 25,
) -> T.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Encode one (possibly multi-chain) sequence for ESMFold.

    Chains are separated by ':' in ``seq`` and joined with ``chain_linker``
    (poly-G by default). Returns a 4-tuple:
      - encoded: int tensor of residue type indices (unknown letters -> 'X')
      - residx: residue indices, with each successive chain shifted by
        ``residue_index_offset`` so chains stay far apart in index space
      - linker_mask: float tensor, 0 at linker positions and 1 elsewhere
      - chain_index: int64 tensor with the chain id of every position
        (linker positions are attributed to the preceding chain)
    """
    if chain_linker is None:
        chain_linker = ""
    if residue_index_offset is None:
        residue_index_offset = 0

    chains = seq.split(":")
    seq = chain_linker.join(chains)

    unk_idx = residue_constants.restype_order_with_x["X"]
    encoded = torch.tensor(
        [residue_constants.restype_order_with_x.get(aa, unk_idx) for aa in seq]
    )
    residx = torch.arange(len(encoded))

    if residue_index_offset > 0:
        start = 0
        for i, chain in enumerate(chains):
            # Shift chain i (together with its trailing linker) by i * offset.
            residx[start : start + len(chain) + len(chain_linker)] += (
                i * residue_index_offset
            )
            start += len(chain) + len(chain_linker)

    linker_mask = torch.ones_like(encoded, dtype=torch.float32)
    chain_index = []
    offset = 0
    for i, chain in enumerate(chains):
        if i > 0:
            # The linker preceding chain i is attributed to the previous chain.
            chain_index.extend([i - 1] * len(chain_linker))
        chain_index.extend([i] * len(chain))
        offset += len(chain)
        # Zero out the linker segment that follows this chain (no-op after
        # the last chain, where the slice is empty).
        linker_mask[offset : offset + len(chain_linker)] = 0
        offset += len(chain_linker)

    chain_index = torch.tensor(chain_index, dtype=torch.int64)

    return encoded, residx, linker_mask, chain_index
59
+
60
+
61
def batch_encode_sequences(
    sequences: T.Sequence[str],
    residue_index_offset: T.Optional[int] = 512,
    chain_linker: T.Optional[str] = "G" * 25,
) -> T.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Encode several (possibly multimeric) sequences into padded batch tensors.

    Each sequence is run through ``encode_sequence`` and the per-sequence
    results are padded to the longest sequence with ``collate_dense_tensors``.
    Returns (aatype, mask, residx, linker_mask, chain_index); chain_index is
    padded with -1 so padding is distinguishable from chain 0.
    """
    per_seq_aatype = []
    per_seq_residx = []
    per_seq_linker_mask = []
    per_seq_chain_index = []
    for sequence in sequences:
        aatype_i, residx_i, linker_mask_i, chain_index_i = encode_sequence(
            sequence,
            residue_index_offset=residue_index_offset,
            chain_linker=chain_linker,
        )
        per_seq_aatype.append(aatype_i)
        per_seq_residx.append(residx_i)
        per_seq_linker_mask.append(linker_mask_i)
        per_seq_chain_index.append(chain_index_i)

    aatype = collate_dense_tensors(per_seq_aatype)
    # Valid (non-pad) positions are exactly the original per-sequence lengths.
    mask = collate_dense_tensors(
        [aatype.new_ones(len(aatype_i)) for aatype_i in per_seq_aatype]
    )
    residx = collate_dense_tensors(per_seq_residx)
    linker_mask = collate_dense_tensors(per_seq_linker_mask)
    chain_index = collate_dense_tensors(per_seq_chain_index, -1)

    return aatype, mask, residx, linker_mask, chain_index
91
+
92
+
93
def output_to_pdb(output: T.Dict) -> T.List[str]:
    """Returns the pdb (file) strings from the model given the model output."""
    # atom14_to_atom37 must be called first, as it fails on latest numpy if the
    # input is a numpy array. It will work if the input is a torch tensor.
    final_atom_positions = atom14_to_atom37(output["positions"][-1], output)
    output = {k: v.to("cpu").numpy() for k, v in output.items()}
    final_atom_positions = final_atom_positions.cpu().numpy()
    final_atom_mask = output["atom37_atom_exists"]
    pdbs = []
    # Emit one PDB string per batch element.
    for i in range(output["aatype"].shape[0]):
        aa = output["aatype"][i]
        pred_pos = final_atom_positions[i]
        mask = final_atom_mask[i]
        # PDB residue numbering is 1-based.
        resid = output["residue_index"][i] + 1
        pred = OFProtein(
            aatype=aa,
            atom_positions=pred_pos,
            atom_mask=mask,
            residue_index=resid,
            # Per-residue pLDDT is stored in the B-factor column.
            b_factors=output["plddt"][i],
            chain_index=output["chain_index"][i] if "chain_index" in output else None,
        )
        pdbs.append(to_pdb(pred))
    return pdbs
117
+
118
+
119
def collate_dense_tensors(
    samples: T.List[torch.Tensor], pad_v: float = 0
) -> torch.Tensor:
    """Stack variable-shape tensors into one dense, padded batch tensor.

    Given tensors of shapes (d_i1, ..., d_iK) that all share rank K and
    device, returns a tensor of shape (N, max_i d_i1, ..., max_i d_iK):
    each sample occupies the leading corner of its slot and the remainder
    is filled with ``pad_v``.

    Raises:
        RuntimeError: if the samples do not all have the same rank.
    """
    if not samples:
        return torch.Tensor()
    if len({t.dim() for t in samples}) != 1:
        raise RuntimeError(
            f"Samples has varying dimensions: {[x.dim() for x in samples]}"
        )
    (device,) = {t.device for t in samples}  # unpack fails if devices are mixed
    padded_shape = [max(sizes) for sizes in zip(*(t.shape for t in samples))]
    out = torch.empty(
        len(samples), *padded_shape, dtype=samples[0].dtype, device=device
    )
    out.fill_(pad_v)
    for slot, t in zip(out, samples):
        slot[tuple(slice(0, n) for n in t.shape)] = t
    return out
148
+
149
+
150
class Attention(nn.Module):
    # Multi-head self-attention with an optional external pairwise bias and
    # optional output gating.
    def __init__(self, embed_dim, num_heads, head_width, gated=False):
        super().__init__()
        assert embed_dim == num_heads * head_width

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_width = head_width

        # One fused projection produces q, k and v (split in forward()).
        self.proj = nn.Linear(embed_dim, embed_dim * 3, bias=False)
        self.o_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.gated = gated
        if gated:
            self.g_proj = nn.Linear(embed_dim, embed_dim)
            # Zero weights + ones bias: the sigmoid gate starts near-open
            # (sigmoid(1)), so gating is initially close to a no-op.
            torch.nn.init.zeros_(self.g_proj.weight)
            torch.nn.init.ones_(self.g_proj.bias)

        # Standard 1/sqrt(head_width) attention scaling.
        self.rescale_factor = self.head_width**-0.5

        torch.nn.init.zeros_(self.o_proj.bias)

    def forward(self, x, mask=None, bias=None, indices=None):
        """
        Basic self attention with optional mask and external pairwise bias.
        To handle sequences of different lengths, use mask.

        Inputs:
          x: batch of input sequences (.. x L x C)
          mask: batch of boolean masks where 1=valid, 0=padding position (.. x L_k). optional.
          bias: batch of scalar pairwise attention biases (.. x Lq x Lk x num_heads). optional.
          indices: unused here.

        Outputs:
          sequence projection (B x L x embed_dim), attention maps (B x L x L x num_heads)
        """

        # Split heads: (..., L, 3*C) -> (..., h, L, 3*c), then q/k/v chunks.
        t = rearrange(self.proj(x), "... l (h c) -> ... h l c", h=self.num_heads)
        q, k, v = t.chunk(3, dim=-1)

        q = self.rescale_factor * q
        a = torch.einsum("...qc,...kc->...qk", q, k)

        # Add external attention bias.
        if bias is not None:
            a = a + rearrange(bias, "... lq lk h -> ... h lq lk")

        # Do not attend to padding tokens.
        if mask is not None:
            mask = repeat(
                mask, "... lk -> ... h lq lk", h=self.num_heads, lq=q.shape[-2]
            )
            # -inf before softmax -> exactly zero attention weight.
            a = a.masked_fill(mask == False, -np.inf)

        a = F.softmax(a, dim=-1)

        y = torch.einsum("...hqk,...hkc->...qhc", a, v)
        y = rearrange(y, "... h c -> ... (h c)", h=self.num_heads)

        if self.gated:
            y = self.g_proj(x).sigmoid() * y
        y = self.o_proj(y)

        return y, rearrange(a, "... lq lk h -> ... h lq lk")
212
+
213
+
214
class Dropout(nn.Module):
    """
    Implementation of dropout with the ability to share the dropout mask
    along a particular dimension.

    The mask is sampled with size 1 along every dimension in ``batch_dim``
    and broadcast, so entire slices are dropped together.

    Args:
        r: Dropout probability.
        batch_dim: Dimension (or list of dimensions) along which the mask
            is shared.
    """

    def __init__(self, r: float, batch_dim: T.Union[int, T.List[int]]):
        # Modern zero-argument super() instead of super(Dropout, self).
        super().__init__()

        self.r = r
        # isinstance is the idiomatic type test (and accepts int subclasses),
        # unlike the original `type(batch_dim) == int` comparison.
        if isinstance(batch_dim, int):
            batch_dim = [batch_dim]
        self.batch_dim = batch_dim
        self.dropout = nn.Dropout(self.r)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply dropout with the mask broadcast along ``self.batch_dim``."""
        shape = list(x.shape)
        if self.batch_dim is not None:
            for bd in self.batch_dim:
                shape[bd] = 1  # size-1 dims broadcast one shared mask
        return x * self.dropout(x.new_ones(shape))
235
+
236
+
237
class SequenceToPair(nn.Module):
    """Derive pairwise features from per-residue sequence features.

    The normalized sequence state is projected to two halves (q, k); every
    pair of positions is combined via elementwise product and difference
    before a final linear projection to the pairwise state dimension.
    """

    def __init__(self, sequence_state_dim, inner_dim, pairwise_state_dim):
        super().__init__()

        self.layernorm = nn.LayerNorm(sequence_state_dim)
        self.proj = nn.Linear(sequence_state_dim, inner_dim * 2, bias=True)
        self.o_proj = nn.Linear(2 * inner_dim, pairwise_state_dim, bias=True)

        # Start both projections as pure linear maps (zero bias).
        torch.nn.init.zeros_(self.proj.bias)
        torch.nn.init.zeros_(self.o_proj.bias)

    def forward(self, sequence_state):
        """
        Inputs:
          sequence_state: B x L x sequence_state_dim

        Output:
          pairwise_state: B x L x L x pairwise_state_dim

        Intermediate state:
          B x L x L x 2*inner_dim
        """
        assert sequence_state.dim() == 3

        normed = self.layernorm(sequence_state)
        q, k = self.proj(normed).chunk(2, dim=-1)

        q_rows = q[:, None, :, :]
        k_cols = k[:, :, None, :]
        pair = torch.cat([q_rows * k_cols, q_rows - k_cols], dim=-1)

        return self.o_proj(pair)
273
+
274
+
275
class PairToSequence(nn.Module):
    """Project pairwise features into per-head attention biases."""

    def __init__(self, pairwise_state_dim, num_heads):
        super().__init__()

        self.layernorm = nn.LayerNorm(pairwise_state_dim)
        self.linear = nn.Linear(pairwise_state_dim, num_heads, bias=False)

    def forward(self, pairwise_state):
        """
        Inputs:
          pairwise_state: B x L x L x pairwise_state_dim

        Output:
          pairwise_bias: B x L x L x num_heads
        """
        assert pairwise_state.dim() == 4
        return self.linear(self.layernorm(pairwise_state))
294
+
295
+
296
class ResidueMLP(nn.Module):
    """Position-wise two-layer MLP applied with a residual connection."""

    def __init__(self, embed_dim, inner_dim, norm=nn.LayerNorm, dropout=0):
        super().__init__()

        # norm -> expand -> ReLU -> contract -> dropout.
        self.mlp = nn.Sequential(
            norm(embed_dim),
            nn.Linear(embed_dim, inner_dim),
            nn.ReLU(),
            nn.Linear(inner_dim, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        residual = x
        return residual + self.mlp(x)
esm/source/esm/esmfold/v1/pretrained.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from pathlib import Path
7
+
8
+ import torch
9
+
10
+ from esm.esmfold.v1.esmfold import ESMFold
11
+
12
+
13
def _load_model(model_name):
    """Build an ESMFold model from a checkpoint name or a local ``.pt`` path.

    Names without a ``.pt`` suffix are fetched from the fair-esm download
    bucket. Missing language-model weights (keys prefixed "esm.") are
    tolerated because the LM is constructed separately; any other missing
    key is an error.
    """
    if model_name.endswith(".pt"):  # local, treat as filepath
        model_data = torch.load(str(Path(model_name)), map_location="cpu")
    else:  # load from hub
        url = f"https://dl.fbaipublicfiles.com/fair-esm/models/{model_name}.pt"
        model_data = torch.hub.load_state_dict_from_url(url, progress=False, map_location="cpu")

    cfg = model_data["cfg"]["model"]
    model_state = model_data["model"]
    model = ESMFold(esmfold_config=cfg)

    expected_keys = set(model.state_dict().keys())
    found_keys = set(model_state.keys())

    missing_essential_keys = [
        key for key in expected_keys - found_keys if not key.startswith("esm.")
    ]
    if missing_essential_keys:
        raise RuntimeError(f"Keys '{', '.join(missing_essential_keys)}' are missing.")

    # strict=False: the frozen LM's weights are intentionally absent.
    model.load_state_dict(model_state, strict=False)

    return model
39
+
40
+
41
def esmfold_v0():
    """ESMFold v0: 3B ESM-2 with 48 folding blocks.

    The version used for the paper (Lin et al, 2022), trained on all PDB
    chains until 2020-05 to ensure temporal holdout with CASP14 and the
    CAMEO validation/test sets reported there.
    """
    checkpoint = "esmfold_3B_v0"
    return _load_model(checkpoint)
49
+
50
+
51
def esmfold_v1():
    """ESMFold v1: 3B ESM-2 with 48 folding blocks.

    Fast, high-accuracy atomic-level structure prediction directly from a
    single protein sequence, using ESM-2 language-model representations.
    """
    checkpoint = "esmfold_3B_v1"
    return _load_model(checkpoint)
60
+
61
+
62
def esmfold_structure_module_only_8M():
    """Structure-module-only ablation on 8M ESM-2 (500K updates).

    No folding blocks; probes language-model capability as a function of
    LM parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_8M"
    return _load_model(checkpoint)
71
+
72
+
73
def esmfold_structure_module_only_8M_270K():
    """Structure-module-only ablation on 8M ESM-2 (270K updates).

    No folding blocks; probes language-model capability as a function of
    LM parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_8M_270K"
    return _load_model(checkpoint)
82
+
83
+
84
def esmfold_structure_module_only_35M():
    """Structure-module-only ablation on 35M ESM-2 (500K updates).

    No folding blocks; probes language-model capability as a function of
    LM parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_35M"
    return _load_model(checkpoint)
93
+
94
+
95
def esmfold_structure_module_only_35M_270K():
    """Structure-module-only ablation on 35M ESM-2 (270K updates).

    No folding blocks; probes language-model capability as a function of
    LM parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_35M_270K"
    return _load_model(checkpoint)
104
+
105
+
106
def esmfold_structure_module_only_150M():
    """Structure-module-only ablation on 150M ESM-2 (500K updates).

    No folding blocks; probes language-model capability as a function of
    LM parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_150M"
    return _load_model(checkpoint)
115
+
116
+
117
def esmfold_structure_module_only_150M_270K():
    """Structure-module-only ablation on 150M ESM-2 (270K updates).

    No folding blocks; probes language-model capability as a function of
    LM parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_150M_270K"
    return _load_model(checkpoint)
126
+
127
+
128
def esmfold_structure_module_only_650M():
    """Structure-module-only ablation on 650M ESM-2 (500K updates).

    No folding blocks; probes language-model capability as a function of
    LM parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_650M"
    return _load_model(checkpoint)
137
+
138
+
139
def esmfold_structure_module_only_650M_270K():
    """Structure-module-only ablation on 650M ESM-2 (270K updates).

    No folding blocks; probes language-model capability as a function of
    LM parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_650M_270K"
    return _load_model(checkpoint)
148
+
149
+
150
def esmfold_structure_module_only_3B():
    """Structure-module-only ablation on 3B ESM-2 (500K updates).

    No folding blocks; probes language-model capability as a function of
    LM parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_3B"
    return _load_model(checkpoint)
159
+
160
+
161
def esmfold_structure_module_only_3B_270K():
    """Structure-module-only ablation on 3B ESM-2 (270K updates).

    No folding blocks; probes language-model capability as a function of
    LM parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_3B_270K"
    return _load_model(checkpoint)
170
+
171
+
172
def esmfold_structure_module_only_15B():
    """Structure-module-only ablation on 15B ESM-2 (270K updates).

    No folding blocks. The 15B-parameter ESM-2 was not trained out to
    500K updates. Probes language-model capability as a function of LM
    parameter count. See Table S1 in (Lin et al, 2022).
    """
    checkpoint = "esmfold_structure_module_only_15B"
    return _load_model(checkpoint)
esm/source/esm/esmfold/v1/tri_self_attn_block.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ import torch
6
+ from openfold.model.triangular_attention import (
7
+ TriangleAttentionEndingNode,
8
+ TriangleAttentionStartingNode,
9
+ )
10
+ from openfold.model.triangular_multiplicative_update import (
11
+ TriangleMultiplicationIncoming,
12
+ TriangleMultiplicationOutgoing,
13
+ )
14
+ from torch import nn
15
+
16
+ from esm.esmfold.v1.misc import (
17
+ Attention,
18
+ Dropout,
19
+ PairToSequence,
20
+ ResidueMLP,
21
+ SequenceToPair,
22
+ )
23
+
24
+
25
+ class TriangularSelfAttentionBlock(nn.Module):
26
+ def __init__(
27
+ self,
28
+ sequence_state_dim,
29
+ pairwise_state_dim,
30
+ sequence_head_width,
31
+ pairwise_head_width,
32
+ dropout=0,
33
+ **__kwargs,
34
+ ):
35
+ super().__init__()
36
+
37
+ assert sequence_state_dim % sequence_head_width == 0
38
+ assert pairwise_state_dim % pairwise_head_width == 0
39
+ sequence_num_heads = sequence_state_dim // sequence_head_width
40
+ pairwise_num_heads = pairwise_state_dim // pairwise_head_width
41
+ assert sequence_state_dim == sequence_num_heads * sequence_head_width
42
+ assert pairwise_state_dim == pairwise_num_heads * pairwise_head_width
43
+ assert pairwise_state_dim % 2 == 0
44
+
45
+ self.sequence_state_dim = sequence_state_dim
46
+ self.pairwise_state_dim = pairwise_state_dim
47
+
48
+ self.layernorm_1 = nn.LayerNorm(sequence_state_dim)
49
+
50
+ self.sequence_to_pair = SequenceToPair(
51
+ sequence_state_dim, pairwise_state_dim // 2, pairwise_state_dim
52
+ )
53
+ self.pair_to_sequence = PairToSequence(pairwise_state_dim, sequence_num_heads)
54
+
55
+ self.seq_attention = Attention(
56
+ sequence_state_dim, sequence_num_heads, sequence_head_width, gated=True
57
+ )
58
+ self.tri_mul_out = TriangleMultiplicationOutgoing(
59
+ pairwise_state_dim,
60
+ pairwise_state_dim,
61
+ )
62
+ self.tri_mul_in = TriangleMultiplicationIncoming(
63
+ pairwise_state_dim,
64
+ pairwise_state_dim,
65
+ )
66
+ self.tri_att_start = TriangleAttentionStartingNode(
67
+ pairwise_state_dim,
68
+ pairwise_head_width,
69
+ pairwise_num_heads,
70
+ inf=1e9,
71
+ ) # type: ignore
72
+ self.tri_att_end = TriangleAttentionEndingNode(
73
+ pairwise_state_dim,
74
+ pairwise_head_width,
75
+ pairwise_num_heads,
76
+ inf=1e9,
77
+ ) # type: ignore
78
+
79
+ self.mlp_seq = ResidueMLP(sequence_state_dim, 4 * sequence_state_dim, dropout=dropout)
80
+ self.mlp_pair = ResidueMLP(pairwise_state_dim, 4 * pairwise_state_dim, dropout=dropout)
81
+
82
+ assert dropout < 0.4
83
+ self.drop = nn.Dropout(dropout)
84
+ self.row_drop = Dropout(dropout * 2, 2)
85
+ self.col_drop = Dropout(dropout * 2, 1)
86
+
87
+ torch.nn.init.zeros_(self.tri_mul_in.linear_z.weight)
88
+ torch.nn.init.zeros_(self.tri_mul_in.linear_z.bias)
89
+ torch.nn.init.zeros_(self.tri_mul_out.linear_z.weight)
90
+ torch.nn.init.zeros_(self.tri_mul_out.linear_z.bias)
91
+ torch.nn.init.zeros_(self.tri_att_start.mha.linear_o.weight)
92
+ torch.nn.init.zeros_(self.tri_att_start.mha.linear_o.bias)
93
+ torch.nn.init.zeros_(self.tri_att_end.mha.linear_o.weight)
94
+ torch.nn.init.zeros_(self.tri_att_end.mha.linear_o.bias)
95
+
96
+ torch.nn.init.zeros_(self.sequence_to_pair.o_proj.weight)
97
+ torch.nn.init.zeros_(self.sequence_to_pair.o_proj.bias)
98
+ torch.nn.init.zeros_(self.pair_to_sequence.linear.weight)
99
+ torch.nn.init.zeros_(self.seq_attention.o_proj.weight)
100
+ torch.nn.init.zeros_(self.seq_attention.o_proj.bias)
101
+ torch.nn.init.zeros_(self.mlp_seq.mlp[-2].weight)
102
+ torch.nn.init.zeros_(self.mlp_seq.mlp[-2].bias)
103
+ torch.nn.init.zeros_(self.mlp_pair.mlp[-2].weight)
104
+ torch.nn.init.zeros_(self.mlp_pair.mlp[-2].bias)
105
+
106
+ def forward(self, sequence_state, pairwise_state, mask=None, chunk_size=None, **__kwargs):
107
+ """
108
+ Inputs:
109
+ sequence_state: B x L x sequence_state_dim
110
+ pairwise_state: B x L x L x pairwise_state_dim
111
+ mask: B x L boolean tensor of valid positions
112
+
113
+ Output:
114
+ sequence_state: B x L x sequence_state_dim
115
+ pairwise_state: B x L x L x pairwise_state_dim
116
+ """
117
+ assert len(sequence_state.shape) == 3
118
+ assert len(pairwise_state.shape) == 4
119
+ if mask is not None:
120
+ assert len(mask.shape) == 2
121
+
122
+ batch_dim, seq_dim, sequence_state_dim = sequence_state.shape
123
+ pairwise_state_dim = pairwise_state.shape[3]
124
+ assert sequence_state_dim == self.sequence_state_dim
125
+ assert pairwise_state_dim == self.pairwise_state_dim
126
+ assert batch_dim == pairwise_state.shape[0]
127
+ assert seq_dim == pairwise_state.shape[1]
128
+ assert seq_dim == pairwise_state.shape[2]
129
+
130
+ # Update sequence state
131
+ bias = self.pair_to_sequence(pairwise_state)
132
+
133
+ # Self attention with bias + mlp.
134
+ y = self.layernorm_1(sequence_state)
135
+ y, _ = self.seq_attention(y, mask=mask, bias=bias)
136
+ sequence_state = sequence_state + self.drop(y)
137
+ sequence_state = self.mlp_seq(sequence_state)
138
+
139
+ # Update pairwise state
140
+ pairwise_state = pairwise_state + self.sequence_to_pair(sequence_state)
141
+
142
+ # Axial attention with triangular bias.
143
+ tri_mask = mask.unsqueeze(2) * mask.unsqueeze(1) if mask is not None else None
144
+ pairwise_state = pairwise_state + self.row_drop(
145
+ self.tri_mul_out(pairwise_state, mask=tri_mask)
146
+ )
147
+ pairwise_state = pairwise_state + self.col_drop(
148
+ self.tri_mul_in(pairwise_state, mask=tri_mask)
149
+ )
150
+ pairwise_state = pairwise_state + self.row_drop(
151
+ self.tri_att_start(pairwise_state, mask=tri_mask, chunk_size=chunk_size)
152
+ )
153
+ pairwise_state = pairwise_state + self.col_drop(
154
+ self.tri_att_end(pairwise_state, mask=tri_mask, chunk_size=chunk_size)
155
+ )
156
+
157
+ # MLP over pairs.
158
+ pairwise_state = self.mlp_pair(pairwise_state)
159
+
160
+ return sequence_state, pairwise_state
esm/source/esm/esmfold/v1/trunk.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ import typing as T
6
+ from contextlib import ExitStack
7
+ from dataclasses import dataclass
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from openfold.model.structure_module import StructureModule
12
+
13
+ from esm.esmfold.v1.tri_self_attn_block import TriangularSelfAttentionBlock
14
+
15
+
16
+ @dataclass
17
+ class StructureModuleConfig:
18
+ c_s: int = 384
19
+ c_z: int = 128
20
+ c_ipa: int = 16
21
+ c_resnet: int = 128
22
+ no_heads_ipa: int = 12
23
+ no_qk_points: int = 4
24
+ no_v_points: int = 8
25
+ dropout_rate: float = 0.1
26
+ no_blocks: int = 8
27
+ no_transition_layers: int = 1
28
+ no_resnet_blocks: int = 2
29
+ no_angles: int = 7
30
+ trans_scale_factor: int = 10
31
+ epsilon: float = 1e-8
32
+ inf: float = 1e5
33
+
34
+
35
+ @dataclass
36
+ class FoldingTrunkConfig:
37
+ _name: str = "FoldingTrunkConfig"
38
+ num_blocks: int = 48
39
+ sequence_state_dim: int = 1024
40
+ pairwise_state_dim: int = 128
41
+ sequence_head_width: int = 32
42
+ pairwise_head_width: int = 32
43
+ position_bins: int = 32
44
+ dropout: float = 0
45
+ layer_drop: float = 0
46
+ cpu_grad_checkpoint: bool = False
47
+
48
+ max_recycles: int = 4
49
+ chunk_size: T.Optional[int] = None
50
+
51
+ structure_module: StructureModuleConfig = StructureModuleConfig()
52
+
53
+
54
+ def get_axial_mask(mask):
55
+ """
56
+ Helper to convert B x L mask of valid positions to axial mask used
57
+ in row column attentions.
58
+
59
+ Input:
60
+ mask: B x L tensor of booleans
61
+
62
+ Output:
63
+ mask: B x L x L tensor of booleans
64
+ """
65
+
66
+ if mask is None:
67
+ return None
68
+ assert len(mask.shape) == 2
69
+ batch_dim, seq_dim = mask.shape
70
+ m = mask.unsqueeze(1).expand(batch_dim, seq_dim, seq_dim)
71
+ m = m.reshape(batch_dim * seq_dim, seq_dim)
72
+ return m
73
+
74
+
75
+ class RelativePosition(nn.Module):
76
+ def __init__(self, bins, pairwise_state_dim):
77
+ super().__init__()
78
+ self.bins = bins
79
+
80
+ # Note an additional offset is used so that the 0th position
81
+ # is reserved for masked pairs.
82
+ self.embedding = torch.nn.Embedding(2 * bins + 2, pairwise_state_dim)
83
+
84
+ def forward(self, residue_index, mask=None):
85
+ """
86
+ Input:
87
+ residue_index: B x L tensor of indices (dtype=torch.long)
88
+ mask: B x L tensor of booleans
89
+
90
+ Output:
91
+ pairwise_state: B x L x L x pairwise_state_dim tensor of embeddings
92
+ """
93
+
94
+ assert residue_index.dtype == torch.long
95
+ if mask is not None:
96
+ assert residue_index.shape == mask.shape
97
+
98
+ diff = residue_index[:, None, :] - residue_index[:, :, None]
99
+ diff = diff.clamp(-self.bins, self.bins)
100
+ diff = diff + self.bins + 1 # Add 1 to adjust for padding index.
101
+
102
+ if mask is not None:
103
+ mask = mask[:, None, :] * mask[:, :, None]
104
+ diff[mask == False] = 0
105
+
106
+ output = self.embedding(diff)
107
+ return output
108
+
109
+
110
+ class FoldingTrunk(nn.Module):
111
+ def __init__(self, **kwargs):
112
+ super().__init__()
113
+ self.cfg = FoldingTrunkConfig(**kwargs)
114
+ assert self.cfg.max_recycles > 0
115
+
116
+ c_s = self.cfg.sequence_state_dim
117
+ c_z = self.cfg.pairwise_state_dim
118
+
119
+ assert c_s % self.cfg.sequence_head_width == 0
120
+ assert c_z % self.cfg.pairwise_head_width == 0
121
+ block = TriangularSelfAttentionBlock
122
+
123
+ self.pairwise_positional_embedding = RelativePosition(self.cfg.position_bins, c_z)
124
+
125
+ self.blocks = nn.ModuleList(
126
+ [
127
+ block(
128
+ sequence_state_dim=c_s,
129
+ pairwise_state_dim=c_z,
130
+ sequence_head_width=self.cfg.sequence_head_width,
131
+ pairwise_head_width=self.cfg.pairwise_head_width,
132
+ dropout=self.cfg.dropout,
133
+ )
134
+ for i in range(self.cfg.num_blocks)
135
+ ]
136
+ )
137
+
138
+ self.recycle_bins = 15
139
+ self.recycle_s_norm = nn.LayerNorm(c_s)
140
+ self.recycle_z_norm = nn.LayerNorm(c_z)
141
+ self.recycle_disto = nn.Embedding(self.recycle_bins, c_z)
142
+ self.recycle_disto.weight[0].detach().zero_()
143
+
144
+ self.structure_module = StructureModule(**self.cfg.structure_module) # type: ignore
145
+ self.trunk2sm_s = nn.Linear(c_s, self.structure_module.c_s)
146
+ self.trunk2sm_z = nn.Linear(c_z, self.structure_module.c_z)
147
+
148
+ self.chunk_size = self.cfg.chunk_size
149
+
150
+ def set_chunk_size(self, chunk_size):
151
+ # This parameter means the axial attention will be computed
152
+ # in a chunked manner. This should make the memory used more or less O(L) instead of O(L^2).
153
+ # It's equivalent to running a for loop over chunks of the dimension we're iterating over,
154
+ # where the chunk_size is the size of the chunks, so 128 would mean to parse 128-lengthed chunks.
155
+ self.chunk_size = chunk_size
156
+
157
+ def forward(self, seq_feats, pair_feats, true_aa, residx, mask, no_recycles: T.Optional[int] = None):
158
+ """
159
+ Inputs:
160
+ seq_feats: B x L x C tensor of sequence features
161
+ pair_feats: B x L x L x C tensor of pair features
162
+ residx: B x L long tensor giving the position in the sequence
163
+ mask: B x L boolean tensor indicating valid residues
164
+
165
+ Output:
166
+ predicted_structure: B x L x (num_atoms_per_residue * 3) tensor wrapped in a Coordinates object
167
+ """
168
+
169
+ device = seq_feats.device
170
+ s_s_0 = seq_feats
171
+ s_z_0 = pair_feats
172
+
173
+ if no_recycles is None:
174
+ no_recycles = self.cfg.max_recycles
175
+ else:
176
+ assert no_recycles >= 0, "Number of recycles must not be negative."
177
+ no_recycles += 1 # First 'recycle' is just the standard forward pass through the model.
178
+
179
+ def trunk_iter(s, z, residx, mask):
180
+ z = z + self.pairwise_positional_embedding(residx, mask=mask)
181
+
182
+ for block in self.blocks:
183
+ s, z = block(s, z, mask=mask, residue_index=residx, chunk_size=self.chunk_size)
184
+ return s, z
185
+
186
+ s_s = s_s_0
187
+ s_z = s_z_0
188
+ recycle_s = torch.zeros_like(s_s)
189
+ recycle_z = torch.zeros_like(s_z)
190
+ recycle_bins = torch.zeros(*s_z.shape[:-1], device=device, dtype=torch.int64)
191
+
192
+ assert no_recycles > 0
193
+ for recycle_idx in range(no_recycles):
194
+ with ExitStack() if recycle_idx == no_recycles - 1 else torch.no_grad():
195
+ # === Recycling ===
196
+ recycle_s = self.recycle_s_norm(recycle_s.detach())
197
+ recycle_z = self.recycle_z_norm(recycle_z.detach())
198
+ recycle_z += self.recycle_disto(recycle_bins.detach())
199
+
200
+ s_s, s_z = trunk_iter(s_s_0 + recycle_s, s_z_0 + recycle_z, residx, mask)
201
+
202
+ # === Structure module ===
203
+ structure = self.structure_module(
204
+ {"single": self.trunk2sm_s(s_s), "pair": self.trunk2sm_z(s_z)},
205
+ true_aa,
206
+ mask.float(),
207
+ )
208
+
209
+ recycle_s = s_s
210
+ recycle_z = s_z
211
+ # Distogram needs the N, CA, C coordinates, and bin constants same as alphafold.
212
+ recycle_bins = FoldingTrunk.distogram(
213
+ structure["positions"][-1][:, :, :3],
214
+ 3.375,
215
+ 21.375,
216
+ self.recycle_bins,
217
+ )
218
+
219
+ assert isinstance(structure, dict) # type: ignore
220
+ structure["s_s"] = s_s
221
+ structure["s_z"] = s_z
222
+
223
+ return structure
224
+
225
+ @staticmethod
226
+ def distogram(coords, min_bin, max_bin, num_bins):
227
+ # Coords are [... L x 3 x 3], where it's [N, CA, C] x 3 coordinates.
228
+ boundaries = torch.linspace(
229
+ min_bin,
230
+ max_bin,
231
+ num_bins - 1,
232
+ device=coords.device,
233
+ )
234
+ boundaries = boundaries**2
235
+ N, CA, C = [x.squeeze(-2) for x in coords.chunk(3, dim=-2)]
236
+ # Infer CB coordinates.
237
+ b = CA - N
238
+ c = C - CA
239
+ a = b.cross(c, dim=-1)
240
+ CB = -0.58273431 * a + 0.56802827 * b - 0.54067466 * c + CA
241
+ dists = (CB[..., None, :, :] - CB[..., :, None, :]).pow(2).sum(dim=-1, keepdims=True)
242
+ bins = torch.sum(dists > boundaries, dim=-1) # [..., L, L]
243
+ return bins
esm/source/esm/inverse_folding/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import gvp_transformer
7
+ from . import util
8
+ from . import multichain_util
esm/source/esm/inverse_folding/features.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ #
6
+ # Portions of this file were adapted from the open source code for the following
7
+ # two papers:
8
+ #
9
+ # Ingraham, J., Garg, V., Barzilay, R., & Jaakkola, T. (2019). Generative
10
+ # models for graph-based protein design. Advances in Neural Information
11
+ # Processing Systems, 32.
12
+ #
13
+ # Jing, B., Eismann, S., Suriana, P., Townshend, R. J. L., & Dror, R. (2020).
14
+ # Learning from Protein Structure with Geometric Vector Perceptrons. In
15
+ # International Conference on Learning Representations.
16
+ #
17
+ # MIT License
18
+ #
19
+ # Copyright (c) 2020 Bowen Jing, Stephan Eismann, Patricia Suriana, Raphael Townshend, Ron Dror
20
+ #
21
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
22
+ # of this software and associated documentation files (the "Software"), to deal
23
+ # in the Software without restriction, including without limitation the rights
24
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25
+ # copies of the Software, and to permit persons to whom the Software is
26
+ # furnished to do so, subject to the following conditions:
27
+ #
28
+ # The above copyright notice and this permission notice shall be included in all
29
+ # copies or substantial portions of the Software.
30
+ #
31
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37
+ # SOFTWARE.
38
+ #
39
+ # ================================================================
40
+ # The below license applies to the portions of the code (parts of
41
+ # src/datasets.py and src/models.py) adapted from Ingraham, et al.
42
+ # ================================================================
43
+ #
44
+ # MIT License
45
+ #
46
+ # Copyright (c) 2019 John Ingraham, Vikas Garg, Regina Barzilay, Tommi Jaakkola
47
+ #
48
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
49
+ # of this software and associated documentation files (the "Software"), to deal
50
+ # in the Software without restriction, including without limitation the rights
51
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
52
+ # copies of the Software, and to permit persons to whom the Software is
53
+ # furnished to do so, subject to the following conditions:
54
+ #
55
+ # The above copyright notice and this permission notice shall be included in all
56
+ # copies or substantial portions of the Software.
57
+ #
58
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
59
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
63
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
64
+ # SOFTWARE.
65
+
66
+ import math
67
+ import numpy as np
68
+ import torch
69
+ import torch.nn as nn
70
+ import torch.nn.functional as F
71
+
72
+ from .gvp_utils import flatten_graph
73
+ from .gvp_modules import GVP, LayerNorm
74
+ from .util import normalize, norm, nan_to_num, rbf
75
+
76
+
77
+ class GVPInputFeaturizer(nn.Module):
78
+
79
+ @staticmethod
80
+ def get_node_features(coords, coord_mask, with_coord_mask=True):
81
+ # scalar features
82
+ node_scalar_features = GVPInputFeaturizer._dihedrals(coords)
83
+ if with_coord_mask:
84
+ node_scalar_features = torch.cat([
85
+ node_scalar_features,
86
+ coord_mask.float().unsqueeze(-1)
87
+ ], dim=-1)
88
+ # vector features
89
+ X_ca = coords[:, :, 1]
90
+ orientations = GVPInputFeaturizer._orientations(X_ca)
91
+ sidechains = GVPInputFeaturizer._sidechains(coords)
92
+ node_vector_features = torch.cat([orientations, sidechains.unsqueeze(-2)], dim=-2)
93
+ return node_scalar_features, node_vector_features
94
+
95
+ @staticmethod
96
+ def _orientations(X):
97
+ forward = normalize(X[:, 1:] - X[:, :-1])
98
+ backward = normalize(X[:, :-1] - X[:, 1:])
99
+ forward = F.pad(forward, [0, 0, 0, 1])
100
+ backward = F.pad(backward, [0, 0, 1, 0])
101
+ return torch.cat([forward.unsqueeze(-2), backward.unsqueeze(-2)], -2)
102
+
103
+ @staticmethod
104
+ def _sidechains(X):
105
+ n, origin, c = X[:, :, 0], X[:, :, 1], X[:, :, 2]
106
+ c, n = normalize(c - origin), normalize(n - origin)
107
+ bisector = normalize(c + n)
108
+ perp = normalize(torch.cross(c, n, dim=-1))
109
+ vec = -bisector * math.sqrt(1 / 3) - perp * math.sqrt(2 / 3)
110
+ return vec
111
+
112
+ @staticmethod
113
+ def _dihedrals(X, eps=1e-7):
114
+ X = torch.flatten(X[:, :, :3], 1, 2)
115
+ bsz = X.shape[0]
116
+ dX = X[:, 1:] - X[:, :-1]
117
+ U = normalize(dX, dim=-1)
118
+ u_2 = U[:, :-2]
119
+ u_1 = U[:, 1:-1]
120
+ u_0 = U[:, 2:]
121
+
122
+ # Backbone normals
123
+ n_2 = normalize(torch.cross(u_2, u_1, dim=-1), dim=-1)
124
+ n_1 = normalize(torch.cross(u_1, u_0, dim=-1), dim=-1)
125
+
126
+ # Angle between normals
127
+ cosD = torch.sum(n_2 * n_1, -1)
128
+ cosD = torch.clamp(cosD, -1 + eps, 1 - eps)
129
+ D = torch.sign(torch.sum(u_2 * n_1, -1)) * torch.acos(cosD)
130
+
131
+ # This scheme will remove phi[0], psi[-1], omega[-1]
132
+ D = F.pad(D, [1, 2])
133
+ D = torch.reshape(D, [bsz, -1, 3])
134
+ # Lift angle representations to the circle
135
+ D_features = torch.cat([torch.cos(D), torch.sin(D)], -1)
136
+ return D_features
137
+
138
+ @staticmethod
139
+ def _positional_embeddings(edge_index,
140
+ num_embeddings=None,
141
+ num_positional_embeddings=16,
142
+ period_range=[2, 1000]):
143
+ # From https://github.com/jingraham/neurips19-graph-protein-design
144
+ num_embeddings = num_embeddings or num_positional_embeddings
145
+ d = edge_index[0] - edge_index[1]
146
+
147
+ frequency = torch.exp(
148
+ torch.arange(0, num_embeddings, 2, dtype=torch.float32,
149
+ device=edge_index.device)
150
+ * -(np.log(10000.0) / num_embeddings)
151
+ )
152
+ angles = d.unsqueeze(-1) * frequency
153
+ E = torch.cat((torch.cos(angles), torch.sin(angles)), -1)
154
+ return E
155
+
156
+ @staticmethod
157
+ def _dist(X, coord_mask, padding_mask, top_k_neighbors, eps=1e-8):
158
+ """ Pairwise euclidean distances """
159
+ bsz, maxlen = X.size(0), X.size(1)
160
+ coord_mask_2D = torch.unsqueeze(coord_mask,1) * torch.unsqueeze(coord_mask,2)
161
+ residue_mask = ~padding_mask
162
+ residue_mask_2D = torch.unsqueeze(residue_mask,1) * torch.unsqueeze(residue_mask,2)
163
+ dX = torch.unsqueeze(X,1) - torch.unsqueeze(X,2)
164
+ D = coord_mask_2D * norm(dX, dim=-1)
165
+
166
+ # sorting preference: first those with coords, then among the residues that
167
+ # exist but are masked use distance in sequence as tie breaker, and then the
168
+ # residues that came from padding are last
169
+ seqpos = torch.arange(maxlen, device=X.device)
170
+ Dseq = torch.abs(seqpos.unsqueeze(1) - seqpos.unsqueeze(0)).repeat(bsz, 1, 1)
171
+ D_adjust = nan_to_num(D) + (~coord_mask_2D) * (1e8 + Dseq*1e6) + (
172
+ ~residue_mask_2D) * (1e10)
173
+
174
+ if top_k_neighbors == -1:
175
+ D_neighbors = D_adjust
176
+ E_idx = seqpos.repeat(
177
+ *D_neighbors.shape[:-1], 1)
178
+ else:
179
+ # Identify k nearest neighbors (including self)
180
+ k = min(top_k_neighbors, X.size(1))
181
+ D_neighbors, E_idx = torch.topk(D_adjust, k, dim=-1, largest=False)
182
+
183
+ coord_mask_neighbors = (D_neighbors < 5e7)
184
+ residue_mask_neighbors = (D_neighbors < 5e9)
185
+ return D_neighbors, E_idx, coord_mask_neighbors, residue_mask_neighbors
186
+
187
+
188
+ class Normalize(nn.Module):
189
+ def __init__(self, features, epsilon=1e-6):
190
+ super(Normalize, self).__init__()
191
+ self.gain = nn.Parameter(torch.ones(features))
192
+ self.bias = nn.Parameter(torch.zeros(features))
193
+ self.epsilon = epsilon
194
+
195
+ def forward(self, x, dim=-1):
196
+ mu = x.mean(dim, keepdim=True)
197
+ sigma = torch.sqrt(x.var(dim, keepdim=True) + self.epsilon)
198
+ gain = self.gain
199
+ bias = self.bias
200
+ # Reshape
201
+ if dim != -1:
202
+ shape = [1] * len(mu.size())
203
+ shape[dim] = self.gain.size()[0]
204
+ gain = gain.view(shape)
205
+ bias = bias.view(shape)
206
+ return gain * (x - mu) / (sigma + self.epsilon) + bias
207
+
208
+
209
+ class DihedralFeatures(nn.Module):
210
+ def __init__(self, node_embed_dim):
211
+ """ Embed dihedral angle features. """
212
+ super(DihedralFeatures, self).__init__()
213
+ # 3 dihedral angles; sin and cos of each angle
214
+ node_in = 6
215
+ # Normalization and embedding
216
+ self.node_embedding = nn.Linear(node_in, node_embed_dim, bias=True)
217
+ self.norm_nodes = Normalize(node_embed_dim)
218
+
219
+ def forward(self, X):
220
+ """ Featurize coordinates as an attributed graph """
221
+ V = self._dihedrals(X)
222
+ V = self.node_embedding(V)
223
+ V = self.norm_nodes(V)
224
+ return V
225
+
226
+ @staticmethod
227
+ def _dihedrals(X, eps=1e-7, return_angles=False):
228
+ # First 3 coordinates are N, CA, C
229
+ X = X[:,:,:3,:].reshape(X.shape[0], 3*X.shape[1], 3)
230
+
231
+ # Shifted slices of unit vectors
232
+ dX = X[:,1:,:] - X[:,:-1,:]
233
+ U = F.normalize(dX, dim=-1)
234
+ u_2 = U[:,:-2,:]
235
+ u_1 = U[:,1:-1,:]
236
+ u_0 = U[:,2:,:]
237
+ # Backbone normals
238
+ n_2 = F.normalize(torch.cross(u_2, u_1, dim=-1), dim=-1)
239
+ n_1 = F.normalize(torch.cross(u_1, u_0, dim=-1), dim=-1)
240
+
241
+ # Angle between normals
242
+ cosD = (n_2 * n_1).sum(-1)
243
+ cosD = torch.clamp(cosD, -1+eps, 1-eps)
244
+ D = torch.sign((u_2 * n_1).sum(-1)) * torch.acos(cosD)
245
+
246
+ # This scheme will remove phi[0], psi[-1], omega[-1]
247
+ D = F.pad(D, (1,2), 'constant', 0)
248
+ D = D.view((D.size(0), int(D.size(1)/3), 3))
249
+ phi, psi, omega = torch.unbind(D,-1)
250
+
251
+ if return_angles:
252
+ return phi, psi, omega
253
+
254
+ # Lift angle representations to the circle
255
+ D_features = torch.cat((torch.cos(D), torch.sin(D)), 2)
256
+ return D_features
257
+
258
+
259
+ class GVPGraphEmbedding(GVPInputFeaturizer):
260
+
261
+ def __init__(self, args):
262
+ super().__init__()
263
+ self.top_k_neighbors = args.top_k_neighbors
264
+ self.num_positional_embeddings = 16
265
+ self.remove_edges_without_coords = True
266
+ node_input_dim = (7, 3)
267
+ edge_input_dim = (34, 1)
268
+ node_hidden_dim = (args.node_hidden_dim_scalar,
269
+ args.node_hidden_dim_vector)
270
+ edge_hidden_dim = (args.edge_hidden_dim_scalar,
271
+ args.edge_hidden_dim_vector)
272
+ self.embed_node = nn.Sequential(
273
+ GVP(node_input_dim, node_hidden_dim, activations=(None, None)),
274
+ LayerNorm(node_hidden_dim, eps=1e-4)
275
+ )
276
+ self.embed_edge = nn.Sequential(
277
+ GVP(edge_input_dim, edge_hidden_dim, activations=(None, None)),
278
+ LayerNorm(edge_hidden_dim, eps=1e-4)
279
+ )
280
+ self.embed_confidence = nn.Linear(16, args.node_hidden_dim_scalar)
281
+
282
+ def forward(self, coords, coord_mask, padding_mask, confidence):
283
+ with torch.no_grad():
284
+ node_features = self.get_node_features(coords, coord_mask)
285
+ edge_features, edge_index = self.get_edge_features(
286
+ coords, coord_mask, padding_mask)
287
+ node_embeddings_scalar, node_embeddings_vector = self.embed_node(node_features)
288
+ edge_embeddings = self.embed_edge(edge_features)
289
+
290
+ rbf_rep = rbf(confidence, 0., 1.)
291
+ node_embeddings = (
292
+ node_embeddings_scalar + self.embed_confidence(rbf_rep),
293
+ node_embeddings_vector
294
+ )
295
+
296
+ node_embeddings, edge_embeddings, edge_index = flatten_graph(
297
+ node_embeddings, edge_embeddings, edge_index)
298
+ return node_embeddings, edge_embeddings, edge_index
299
+
300
+ def get_edge_features(self, coords, coord_mask, padding_mask):
301
+ X_ca = coords[:, :, 1]
302
+ # Get distances to the top k neighbors
303
+ E_dist, E_idx, E_coord_mask, E_residue_mask = GVPInputFeaturizer._dist(
304
+ X_ca, coord_mask, padding_mask, self.top_k_neighbors)
305
+ # Flatten the graph to be batch size 1 for torch_geometric package
306
+ dest = E_idx
307
+ B, L, k = E_idx.shape[:3]
308
+ src = torch.arange(L, device=E_idx.device).view([1, L, 1]).expand(B, L, k)
309
+ # After flattening, [2, B, E]
310
+ edge_index = torch.stack([src, dest], dim=0).flatten(2, 3)
311
+ # After flattening, [B, E]
312
+ E_dist = E_dist.flatten(1, 2)
313
+ E_coord_mask = E_coord_mask.flatten(1, 2).unsqueeze(-1)
314
+ E_residue_mask = E_residue_mask.flatten(1, 2)
315
+ # Calculate relative positional embeddings and distance RBF
316
+ pos_embeddings = GVPInputFeaturizer._positional_embeddings(
317
+ edge_index,
318
+ num_positional_embeddings=self.num_positional_embeddings,
319
+ )
320
+ D_rbf = rbf(E_dist, 0., 20.)
321
+ # Calculate relative orientation
322
+ X_src = X_ca.unsqueeze(2).expand(-1, -1, k, -1).flatten(1, 2)
323
+ X_dest = torch.gather(
324
+ X_ca,
325
+ 1,
326
+ edge_index[1, :, :].unsqueeze(-1).expand([B, L*k, 3])
327
+ )
328
+ coord_mask_src = coord_mask.unsqueeze(2).expand(-1, -1, k).flatten(1, 2)
329
+ coord_mask_dest = torch.gather(
330
+ coord_mask,
331
+ 1,
332
+ edge_index[1, :, :].expand([B, L*k])
333
+ )
334
+ E_vectors = X_src - X_dest
335
+ # For the ones without coordinates, substitute in the average vector
336
+ E_vector_mean = torch.sum(E_vectors * E_coord_mask, dim=1,
337
+ keepdims=True) / torch.sum(E_coord_mask, dim=1, keepdims=True)
338
+ E_vectors = E_vectors * E_coord_mask + E_vector_mean * ~(E_coord_mask)
339
+ # Normalize and remove nans
340
+ edge_s = torch.cat([D_rbf, pos_embeddings], dim=-1)
341
+ edge_v = normalize(E_vectors).unsqueeze(-2)
342
+ edge_s, edge_v = map(nan_to_num, (edge_s, edge_v))
343
+ # Also add indications of whether the coordinates are present
344
+ edge_s = torch.cat([
345
+ edge_s,
346
+ (~coord_mask_src).float().unsqueeze(-1),
347
+ (~coord_mask_dest).float().unsqueeze(-1),
348
+ ], dim=-1)
349
+ edge_index[:, ~E_residue_mask] = -1
350
+ if self.remove_edges_without_coords:
351
+ edge_index[:, ~E_coord_mask.squeeze(-1)] = -1
352
+ return (edge_s, edge_v), edge_index.transpose(0, 1)
esm/source/esm/inverse_folding/gvp_encoder.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from argparse import Namespace
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+ from .features import GVPGraphEmbedding
13
+ from .gvp_modules import GVPConvLayer, LayerNorm
14
+ from .gvp_utils import unflatten_graph
15
+
16
+
17
+
18
+ class GVPEncoder(nn.Module):
19
+
20
+ def __init__(self, args):
21
+ super().__init__()
22
+ self.args = args
23
+ self.embed_graph = GVPGraphEmbedding(args)
24
+
25
+ node_hidden_dim = (args.node_hidden_dim_scalar,
26
+ args.node_hidden_dim_vector)
27
+ edge_hidden_dim = (args.edge_hidden_dim_scalar,
28
+ args.edge_hidden_dim_vector)
29
+
30
+ conv_activations = (F.relu, torch.sigmoid)
31
+ self.encoder_layers = nn.ModuleList(
32
+ GVPConvLayer(
33
+ node_hidden_dim,
34
+ edge_hidden_dim,
35
+ drop_rate=args.dropout,
36
+ vector_gate=True,
37
+ attention_heads=0,
38
+ n_message=3,
39
+ conv_activations=conv_activations,
40
+ n_edge_gvps=0,
41
+ eps=1e-4,
42
+ layernorm=True,
43
+ )
44
+ for i in range(args.num_encoder_layers)
45
+ )
46
+
47
+ def forward(self, coords, coord_mask, padding_mask, confidence):
48
+ node_embeddings, edge_embeddings, edge_index = self.embed_graph(
49
+ coords, coord_mask, padding_mask, confidence)
50
+
51
+ for i, layer in enumerate(self.encoder_layers):
52
+ node_embeddings, edge_embeddings = layer(node_embeddings,
53
+ edge_index, edge_embeddings)
54
+
55
+ node_embeddings = unflatten_graph(node_embeddings, coords.shape[0])
56
+ return node_embeddings
esm/source/esm/inverse_folding/gvp_modules.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contents of this file are from the open source code for
2
+ #
3
+ # Jing, B., Eismann, S., Suriana, P., Townshend, R. J. L., & Dror, R. (2020).
4
+ # Learning from Protein Structure with Geometric Vector Perceptrons. In
5
+ # International Conference on Learning Representations.
6
+ #
7
+ # MIT License
8
+ #
9
+ # Copyright (c) 2020 Bowen Jing, Stephan Eismann, Patricia Suriana, Raphael Townshend, Ron Dror
10
+ #
11
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ # of this software and associated documentation files (the "Software"), to deal
13
+ # in the Software without restriction, including without limitation the rights
14
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ # copies of the Software, and to permit persons to whom the Software is
16
+ # furnished to do so, subject to the following conditions:
17
+ #
18
+ # The above copyright notice and this permission notice shall be included in all
19
+ # copies or substantial portions of the Software.
20
+ #
21
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ # SOFTWARE.
28
+
29
+ import typing as T
30
+ import torch
31
+ from torch import nn
32
+ import torch.nn.functional as F
33
+ from torch_geometric.nn import MessagePassing
34
+
35
def tuple_size(tp):
    """Return the sizes of the members of a tuple, with 0 for None entries."""
    return tuple(0 if member is None else member.size() for member in tp)
37
+
38
def tuple_sum(tp1, tp2):
    """Elementwise sum of two (s, V) tuples.

    The vector channel may be ``None`` (no vector features); the result's
    vector channel is ``None`` only when both inputs lack vectors.
    """
    s1, v1 = tp1
    s2, v2 = tp2
    # Bug fix: the original tested `v2 is None and v2 is None`, which never
    # inspected v1 and silently discarded v1 whenever v2 was None.
    if v1 is None and v2 is None:
        return (s1 + s2, None)
    return (s1 + s2, v1 + v2)
44
+
45
def tuple_cat(*args, dim=-1):
    """Concatenate any number of (s, V) tuples elementwise.

    :param dim: dimension along which to concatenate when viewed as the
        ``dim`` index for the scalar-channel tensors. Because the vector
        tensors carry one extra trailing axis of size 3, normalizing ``dim``
        to a non-negative index makes ``dim=-1`` act as ``dim=-2`` on the
        vector-channel tensors.
    """
    scalar_parts, vector_parts = zip(*args)
    axis = dim % len(scalar_parts[0].shape)
    return torch.cat(scalar_parts, dim=axis), torch.cat(vector_parts, dim=axis)
57
+
58
def tuple_index(x, idx):
    """Index a tuple (s, V) along the first dimension.

    :param idx: any object usable as a `torch.Tensor` index
    """
    scalars, vectors = x
    return scalars[idx], vectors[idx]
65
+
66
def randn(n, dims, device="cpu"):
    """Return a random (s, V) tuple drawn elementwise from a standard normal.

    :param n: number of data points
    :param dims: tuple of dimensions (n_scalar, n_vector)
    :return: (s, V) with s.shape = (n, n_scalar) and V.shape = (n, n_vector, 3)
    """
    n_scalar, n_vector = dims
    scalars = torch.randn(n, n_scalar, device=device)
    vectors = torch.randn(n, n_vector, 3, device=device)
    return scalars, vectors
78
+
79
def _norm_no_nan(x, axis=-1, keepdims=False, eps=1e-8, sqrt=True):
    """L2 norm along ``axis``, bounded away from zero by ``eps``.

    Adds ``eps`` to the squared sum instead of clamping (clamp is slower),
    which keeps gradients finite at exactly-zero vectors.

    :param sqrt: if ``False``, returns the square of the L2 norm
    """
    squared = torch.sum(torch.square(x), axis, keepdims) + eps
    if sqrt:
        return torch.sqrt(squared)
    return squared
89
+
90
def _split(x, nv):
    """Split a merged (s, V) representation back into a tuple.

    Inverse of `_merge(s, V)`; use only when the tuple representation
    cannot be used.

    :param x: the `torch.Tensor` returned from `_merge`
    :param nv: the number of vector channels in the input to `_merge`
    """
    # The last 3*nv features are the flattened vector channels.
    vectors = torch.reshape(x[..., -3 * nv:], x.shape[:-1] + (nv, 3))
    scalars = x[..., :-3 * nv]
    return scalars, vectors
102
+
103
def _merge(s, v):
    """Merge a tuple (s, V) into a single `torch.Tensor`.

    The vector channels are flattened and appended after the scalar
    channels. Use `_split(x, nv)` to reverse.
    """
    flat_vectors = torch.reshape(v, v.shape[:-2] + (3 * v.shape[-2],))
    return torch.cat([s, flat_vectors], -1)
112
+
113
class GVP(nn.Module):
    '''
    Geometric Vector Perceptron. See manuscript and README.md
    for more details.

    :param in_dims: tuple (n_scalar, n_vector)
    :param out_dims: tuple (n_scalar, n_vector)
    :param h_dim: intermediate number of vector channels, optional
    :param vector_gate: if True, gate output vectors with a learned linear
        map of the output scalars instead of the vectors' own norms
    :param activations: tuple of functions (scalar_act, vector_act)
    :param tuple_io: whether to keep accepting tuple inputs and outputs when vi
                     or vo = 0
    :param eps: small constant added inside norms for numerical stability
    '''
    def __init__(self, in_dims, out_dims, h_dim=None, vector_gate=False,
                 activations=(F.relu, torch.sigmoid), tuple_io=True,
                 eps=1e-8):
        super(GVP, self).__init__()
        self.si, self.vi = in_dims
        self.so, self.vo = out_dims
        self.tuple_io = tuple_io
        if self.vi:
            # Vector path: project vi -> h_dim vector channels; their norms
            # are concatenated with the scalars before the scalar transform.
            self.h_dim = h_dim or max(self.vi, self.vo)
            self.wh = nn.Linear(self.vi, self.h_dim, bias=False)
            self.ws = nn.Linear(self.h_dim + self.si, self.so)
            if self.vo:
                self.wv = nn.Linear(self.h_dim, self.vo, bias=False)
                if vector_gate:
                    self.wg = nn.Linear(self.so, self.vo)
        else:
            # Scalar-only input: a single linear layer suffices.
            self.ws = nn.Linear(self.si, self.so)

        self.vector_gate = vector_gate
        self.scalar_act, self.vector_act = activations
        self.eps = eps

    def forward(self, x):
        '''
        :param x: tuple (s, V) of `torch.Tensor`,
                  or (if vectors_in is 0), a single `torch.Tensor`
        :return: tuple (s, V) of `torch.Tensor`,
                 or (if vectors_out is 0), a single `torch.Tensor`
        '''
        if self.vi:
            s, v = x
            # View V as (..., 3, n_vector) so nn.Linear mixes vector channels
            # while leaving the xyz components untouched (rotation-equivariant).
            v = torch.transpose(v, -1, -2)
            vh = self.wh(v)
            vn = _norm_no_nan(vh, axis=-2, eps=self.eps)
            s = self.ws(torch.cat([s, vn], -1))
            if self.scalar_act:
                s = self.scalar_act(s)
            if self.vo:
                v = self.wv(vh)
                v = torch.transpose(v, -1, -2)
                if self.vector_gate:
                    # Gate each output vector channel by a value derived
                    # from the output scalars.
                    g = self.wg(s).unsqueeze(-1)
                else:
                    # Standard GVP: gate by the channel's own norm.
                    g = _norm_no_nan(v, axis=-1, keepdims=True, eps=self.eps)
                if self.vector_act:
                    g = self.vector_act(g)
                v = v * g
        else:
            if self.tuple_io:
                assert x[1] is None
                x = x[0]
            s = self.ws(x)
            if self.scalar_act:
                s = self.scalar_act(s)
            if self.vo:
                # No input vectors to derive outputs from: emit zero vectors
                # of the requested output shape.
                v = torch.zeros(list(s.shape)[:-1] + [self.vo, 3],
                                device=s.device)

        if self.vo:
            return (s, v)
        elif self.tuple_io:
            return (s, None)
        else:
            return s
189
+
190
+
191
class _VDropout(nn.Module):
    """Vector-channel dropout: all three xyz components of a vector channel
    are kept or dropped together, preserving vector directions.
    """

    def __init__(self, drop_rate):
        super().__init__()
        self.drop_rate = drop_rate

    def forward(self, x):
        """Apply dropout to a vector-channel tensor; passes None through
        unchanged, and is the identity in eval mode.
        """
        if x is None:
            return None
        if not self.training:
            return x
        keep_prob = 1 - self.drop_rate
        # One Bernoulli draw per vector channel, broadcast across xyz;
        # rescale so the expected value is unchanged.
        mask = torch.bernoulli(
            keep_prob * torch.ones(x.shape[:-1], device=x.device)
        ).unsqueeze(-1)
        return mask * x / keep_prob
214
+
215
class Dropout(nn.Module):
    """Combined dropout for tuples (s, V): standard dropout on the scalar
    channels, channel-wise `_VDropout` on the vector channels.
    """

    def __init__(self, drop_rate):
        super().__init__()
        self.sdropout = nn.Dropout(drop_rate)
        self.vdropout = _VDropout(drop_rate)

    def forward(self, x):
        """
        :param x: tuple (s, V) of `torch.Tensor`, or a single `torch.Tensor`
                  (assumed to be scalar channels only)
        """
        # Strict type check (not isinstance) deliberately preserved: only a
        # bare torch.Tensor takes the scalar-only path.
        if type(x) is torch.Tensor:
            return self.sdropout(x)
        scalars, vectors = x
        return self.sdropout(scalars), self.vdropout(vectors)
235
+
236
class LayerNorm(nn.Module):
    '''
    Combined LayerNorm for tuples (s, V).
    Takes tuples (s, V) as input and as output.

    Scalar channels use a standard `nn.LayerNorm`; vector channels are
    rescaled by an RMS norm computed over the non-zero vector channels
    only, which preserves each vector's direction.
    '''
    def __init__(self, dims, tuple_io=True, eps=1e-8):
        super(LayerNorm, self).__init__()
        self.tuple_io = tuple_io
        self.s, self.v = dims
        self.scalar_norm = nn.LayerNorm(self.s)
        self.eps = eps

    def forward(self, x):
        '''
        :param x: tuple (s, V) of `torch.Tensor`,
                  or single `torch.Tensor`
                  (will be assumed to be scalar channels)
        '''
        if not self.v:
            # No vector channels configured: normalize scalars only.
            if self.tuple_io:
                return self.scalar_norm(x[0]), None
            return self.scalar_norm(x)
        s, v = x
        # Squared norm per vector channel; sqrt is deferred until after
        # averaging for numerical stability.
        vn = _norm_no_nan(v, axis=-1, keepdims=True, sqrt=False, eps=self.eps)
        # Exclude exactly-zero channels (e.g. masked/padded entries) from
        # the normalization statistics so they don't bias the RMS.
        nonzero_mask = (vn > 2 * self.eps)
        vn = torch.sum(vn * nonzero_mask, dim=-2, keepdim=True
            ) / (self.eps + torch.sum(nonzero_mask, dim=-2, keepdim=True))
        vn = torch.sqrt(vn + self.eps)
        # Normalize by the shared RMS; zero channels stay exactly zero.
        v = nonzero_mask * (v / vn)
        return self.scalar_norm(s), v
266
+
267
class GVPConv(MessagePassing):
    '''
    Graph convolution / message passing with Geometric Vector Perceptrons.
    Takes in a graph with node and edge embeddings,
    and returns new node embeddings.

    This does NOT do residual updates and pointwise feedforward layers
    ---see `GVPConvLayer`.

    :param in_dims: input node embedding dimensions (n_scalar, n_vector)
    :param out_dims: output node embedding dimensions (n_scalar, n_vector)
    :param edge_dims: input edge embedding dimensions (n_scalar, n_vector)
    :param n_layers: number of GVPs in the message function
    :param vector_gate: passed through to the message-function GVPs
    :param module_list: preconstructed message function, overrides n_layers
    :param aggr: should be "add" if some incoming edges are masked, as in
                 a masked autoregressive decoder architecture
    :param eps: numerical-stability constant for the GVP norms
    :param activations: (scalar_act, vector_act) for the intermediate GVPs
    '''
    def __init__(self, in_dims, out_dims, edge_dims, n_layers=3,
                 vector_gate=False, module_list=None, aggr="mean", eps=1e-8,
                 activations=(F.relu, torch.sigmoid)):
        super(GVPConv, self).__init__(aggr=aggr)
        self.eps = eps
        self.si, self.vi = in_dims
        self.so, self.vo = out_dims
        self.se, self.ve = edge_dims

        module_list = module_list or []
        if not module_list:
            # The message input is (src node, edge, dst node) concatenated,
            # hence the 2*node + edge input dimensions.
            if n_layers == 1:
                module_list.append(
                    GVP((2*self.si + self.se, 2*self.vi + self.ve),
                        (self.so, self.vo), activations=(None, None)))
            else:
                module_list.append(
                    GVP((2*self.si + self.se, 2*self.vi + self.ve), out_dims,
                        vector_gate=vector_gate, activations=activations)
                )
                for i in range(n_layers - 2):
                    module_list.append(GVP(out_dims, out_dims,
                                           vector_gate=vector_gate))
                # Final GVP has no activations so messages stay unbounded.
                module_list.append(GVP(out_dims, out_dims,
                                       activations=(None, None)))
        self.message_func = nn.Sequential(*module_list)

    def forward(self, x, edge_index, edge_attr):
        '''
        :param x: tuple (s, V) of `torch.Tensor`
        :param edge_index: array of shape [2, n_edges]
        :param edge_attr: tuple (s, V) of `torch.Tensor`
        '''
        x_s, x_v = x
        # Flatten vectors into the feature dim: PyG's propagate only handles
        # flat per-node tensors. `message` re-inflates; `_split` undoes the
        # merge on the aggregated output.
        message = self.propagate(edge_index,
                    s=x_s, v=x_v.reshape(x_v.shape[0], 3*x_v.shape[1]),
                    edge_attr=edge_attr)
        return _split(message, self.vo)

    def message(self, s_i, v_i, s_j, v_j, edge_attr):
        # Restore the (n_vector, 3) shape flattened in forward().
        v_j = v_j.view(v_j.shape[0], v_j.shape[1]//3, 3)
        v_i = v_i.view(v_i.shape[0], v_i.shape[1]//3, 3)
        message = tuple_cat((s_j, v_j), edge_attr, (s_i, v_i))
        message = self.message_func(message)
        # Merge back to a flat tensor for PyG's aggregation step.
        return _merge(*message)
329
+
330
+
331
class GVPConvLayer(nn.Module):
    '''
    Full graph convolution / message passing layer with
    Geometric Vector Perceptrons. Residually updates node embeddings with
    aggregated incoming messages, applies a pointwise feedforward
    network to node embeddings, and returns updated node embeddings.

    To only compute the aggregated messages, see `GVPConv`.

    :param node_dims: node embedding dimensions (n_scalar, n_vector)
    :param edge_dims: input edge embedding dimensions (n_scalar, n_vector)
    :param vector_gate: passed through to all internal GVPs
    :param n_message: number of GVPs to use in message function
    :param n_feedforward: number of GVPs to use in feedforward function
    :param drop_rate: drop probability in all dropout layers
    :param autoregressive: if `True`, this `GVPConvLayer` will be used
           with a different set of input node embeddings for messages
           where src >= dst
    :param attention_heads: must be 0; attention is not implemented
    :param n_edge_gvps: if > 0, also residually update edge embeddings
           with a GVP message function of this depth
    :param layernorm: whether to use LayerNorm (vs identity) around the
           residual updates
    '''
    def __init__(self, node_dims, edge_dims, vector_gate=False,
                 n_message=3, n_feedforward=2, drop_rate=.1,
                 autoregressive=False, attention_heads=0,
                 conv_activations=(F.relu, torch.sigmoid),
                 n_edge_gvps=0, layernorm=True, eps=1e-8):

        super(GVPConvLayer, self).__init__()
        if attention_heads == 0:
            # "add" aggregation when autoregressive: forward/backward message
            # sums are renormalized by true in-degree in forward().
            self.conv = GVPConv(
                node_dims, node_dims, edge_dims, n_layers=n_message,
                vector_gate=vector_gate,
                aggr="add" if autoregressive else "mean",
                activations=conv_activations,
                eps=eps,
            )
        else:
            raise NotImplementedError
        if layernorm:
            self.norm = nn.ModuleList([LayerNorm(node_dims, eps=eps) for _ in range(2)])
        else:
            self.norm = nn.ModuleList([nn.Identity() for _ in range(2)])
        self.dropout = nn.ModuleList([Dropout(drop_rate) for _ in range(2)])

        # Pointwise feedforward: expand to (4x scalar, 2x vector) hidden
        # dims unless only a single GVP was requested.
        ff_func = []
        if n_feedforward == 1:
            ff_func.append(GVP(node_dims, node_dims, activations=(None, None)))
        else:
            hid_dims = 4*node_dims[0], 2*node_dims[1]
            ff_func.append(GVP(node_dims, hid_dims, vector_gate=vector_gate))
            for i in range(n_feedforward-2):
                ff_func.append(GVP(hid_dims, hid_dims, vector_gate=vector_gate))
            ff_func.append(GVP(hid_dims, node_dims, activations=(None, None)))
        self.ff_func = nn.Sequential(*ff_func)

        self.edge_message_func = None
        if n_edge_gvps > 0:
            si, vi = node_dims
            se, ve = edge_dims
            # Edge message input is (src node, edge, dst node) concatenated.
            module_list = [
                GVP((2*si + se, 2*vi + ve), edge_dims, vector_gate=vector_gate)
            ]
            for i in range(n_edge_gvps - 2):
                module_list.append(GVP(edge_dims, edge_dims,
                                       vector_gate=vector_gate))
            if n_edge_gvps > 1:
                module_list.append(GVP(edge_dims, edge_dims,
                                       activations=(None, None)))
            self.edge_message_func = nn.Sequential(*module_list)
            if layernorm:
                self.edge_norm = LayerNorm(edge_dims, eps=eps)
            else:
                self.edge_norm = nn.Identity()
            self.edge_dropout = Dropout(drop_rate)

    def forward(self, x, edge_index, edge_attr,
                autoregressive_x=None, node_mask=None):
        '''
        :param x: tuple (s, V) of `torch.Tensor`
        :param edge_index: array of shape [2, n_edges]
        :param edge_attr: tuple (s, V) of `torch.Tensor`
        :param autoregressive_x: tuple (s, V) of `torch.Tensor`.
                If not `None`, will be used as src node embeddings
                for forming messages where src >= dst. The current node
                embeddings `x` will still be the base of the update and the
                pointwise feedforward.
        :param node_mask: array of type `bool` to index into the first
                dim of node embeddings (s, V). If not `None`, only
                these nodes will be updated.
        :return: tuple (updated node embeddings, updated edge embeddings)
        '''
        if self.edge_message_func:
            src, dst = edge_index
            if autoregressive_x is None:
                x_src = x[0][src], x[1][src]
            else:
                # For edges with src >= dst, take the source embedding from
                # the alternative (autoregressive) set instead of x.
                mask = (src < dst).unsqueeze(-1)
                x_src = (
                    torch.where(mask, x[0][src], autoregressive_x[0][src]),
                    torch.where(mask.unsqueeze(-1), x[1][src],
                        autoregressive_x[1][src])
                )
            x_dst = x[0][dst], x[1][dst]
            x_edge = (
                torch.cat([x_src[0], edge_attr[0], x_dst[0]], dim=-1),
                torch.cat([x_src[1], edge_attr[1], x_dst[1]], dim=-2)
            )
            edge_attr_dh = self.edge_message_func(x_edge)
            # Residual + norm update of the edge embeddings.
            edge_attr = self.edge_norm(tuple_sum(edge_attr,
                self.edge_dropout(edge_attr_dh)))

        if autoregressive_x is not None:
            # Guarding this import here to remove the dependency on torch_scatter, since this isn't used
            # in ESM-IF1
            from torch_scatter import scatter_add
            src, dst = edge_index
            mask = src < dst
            edge_index_forward = edge_index[:, mask]
            edge_index_backward = edge_index[:, ~mask]
            edge_attr_forward = tuple_index(edge_attr, mask)
            edge_attr_backward = tuple_index(edge_attr, ~mask)

            # Sum-aggregate forward (src<dst, from x) and backward messages
            # (from autoregressive_x) separately, then renormalize by the
            # true in-degree of each destination node.
            dh = tuple_sum(
                self.conv(x, edge_index_forward, edge_attr_forward),
                self.conv(autoregressive_x, edge_index_backward, edge_attr_backward)
            )

            count = scatter_add(torch.ones_like(dst), dst,
                        dim_size=dh[0].size(0)).clamp(min=1).unsqueeze(-1)

            dh = dh[0] / count, dh[1] / count.unsqueeze(-1)

        else:
            dh = self.conv(x, edge_index, edge_attr)

        if node_mask is not None:
            # Restrict the residual update to the selected nodes.
            x_ = x
            x, dh = tuple_index(x, node_mask), tuple_index(dh, node_mask)

        # Residual + norm around the conv, then around the feedforward.
        x = self.norm[0](tuple_sum(x, self.dropout[0](dh)))

        dh = self.ff_func(x)
        x = self.norm[1](tuple_sum(x, self.dropout[1](dh)))

        if node_mask is not None:
            # Scatter the updated subset back into the full embeddings
            # (in-place on the original tensors).
            x_[0][node_mask], x_[1][node_mask] = x[0], x[1]
            x = x_

        return x, edge_attr
esm/source/esm/inverse_folding/gvp_transformer.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ from typing import Any, Dict, List, Optional, Tuple, NamedTuple
8
+ import torch
9
+ from torch import nn
10
+ from torch import Tensor
11
+ import torch.nn.functional as F
12
+ from scipy.spatial import transform
13
+
14
+ from esm.data import Alphabet
15
+
16
+ from .features import DihedralFeatures
17
+ from .gvp_encoder import GVPEncoder
18
+ from .gvp_utils import unflatten_graph
19
+ from .gvp_transformer_encoder import GVPTransformerEncoder
20
+ from .transformer_decoder import TransformerDecoder
21
+ from .util import rotate, CoordBatchConverter
22
+
23
+
24
class GVPTransformerModel(nn.Module):
    """
    GVP-Transformer inverse folding model.

    Architecture: Geometric GVP-GNN as initial layers, followed by
    sequence-to-sequence Transformer encoder and decoder.
    """

    def __init__(self, args, alphabet):
        super().__init__()
        # Separate token embeddings for encoder and decoder (dims may differ).
        encoder_embed_tokens = self.build_embedding(
            args, alphabet, args.encoder_embed_dim,
        )
        decoder_embed_tokens = self.build_embedding(
            args, alphabet, args.decoder_embed_dim,
        )
        encoder = self.build_encoder(args, alphabet, encoder_embed_tokens)
        decoder = self.build_decoder(args, alphabet, decoder_embed_tokens)
        self.args = args
        self.encoder = encoder
        self.decoder = decoder

    @classmethod
    def build_encoder(cls, args, src_dict, embed_tokens):
        """Construct the structure encoder (GVP-GNN + Transformer encoder)."""
        encoder = GVPTransformerEncoder(args, src_dict, embed_tokens)
        return encoder

    @classmethod
    def build_decoder(cls, args, tgt_dict, embed_tokens):
        """Construct the autoregressive sequence decoder."""
        decoder = TransformerDecoder(
            args,
            tgt_dict,
            embed_tokens,
        )
        return decoder

    @classmethod
    def build_embedding(cls, args, dictionary, embed_dim):
        """Build a token embedding initialized N(0, embed_dim^-0.5), with the
        padding row zeroed."""
        num_embeddings = len(dictionary)
        padding_idx = dictionary.padding_idx
        emb = nn.Embedding(num_embeddings, embed_dim, padding_idx)
        nn.init.normal_(emb.weight, mean=0, std=embed_dim ** -0.5)
        nn.init.constant_(emb.weight[padding_idx], 0)
        return emb

    def forward(
        self,
        coords,
        padding_mask,
        confidence,
        prev_output_tokens,
        return_all_hiddens: bool = False,
        features_only: bool = False,
    ):
        """Teacher-forced forward pass: encode the structure, then decode
        logits for the next token at every position of prev_output_tokens."""
        encoder_out = self.encoder(coords, padding_mask, confidence,
            return_all_hiddens=return_all_hiddens)
        logits, extra = self.decoder(
            prev_output_tokens,
            encoder_out=encoder_out,
            features_only=features_only,
            return_all_hiddens=return_all_hiddens,
        )
        return logits, extra

    def sample(self, coords, partial_seq=None, temperature=1.0, confidence=None, device=None):
        """
        Samples sequences based on multinomial sampling (no beam search).

        Args:
            coords: L x 3 x 3 list representing one backbone
            partial_seq: Optional, partial sequence with mask tokens if part of
                the sequence is known
            temperature: sampling temperature, use low temperature for higher
                sequence recovery and high temperature for higher diversity
            confidence: optional length L list of confidence scores for coordinates
            device: optional torch device for the sampling tensors
        """
        L = len(coords)
        # Convert to batch format (batch of one structure).
        batch_converter = CoordBatchConverter(self.decoder.dictionary)
        batch_coords, confidence, _, _, padding_mask = (
            batch_converter([(coords, confidence, None)], device=device)
        )

        # Start with prepend token; unknown positions are <mask>.
        mask_idx = self.decoder.dictionary.get_idx('<mask>')
        sampled_tokens = torch.full((1, 1+L), mask_idx, dtype=int)
        sampled_tokens[0, 0] = self.decoder.dictionary.get_idx('<cath>')
        if partial_seq is not None:
            # Fix known residues; only <mask> positions get sampled below.
            for i, c in enumerate(partial_seq):
                sampled_tokens[0, i+1] = self.decoder.dictionary.get_idx(c)

        # Save incremental states for faster sampling
        incremental_state = dict()

        # Run encoder only once
        encoder_out = self.encoder(batch_coords, padding_mask, confidence)

        # Make sure all tensors are on the same device if a GPU is present
        if device:
            sampled_tokens = sampled_tokens.to(device)

        # Decode one token at a time (autoregressive).
        for i in range(1, L+1):
            logits, _ = self.decoder(
                sampled_tokens[:, :i],
                encoder_out,
                incremental_state=incremental_state,
            )
            logits = logits[0].transpose(0, 1)
            logits /= temperature
            probs = F.softmax(logits, dim=-1)
            # Only sample positions that were not fixed by partial_seq.
            if sampled_tokens[0, i] == mask_idx:
                sampled_tokens[:, i] = torch.multinomial(probs, 1).squeeze(-1)
        # Drop the prepend token before converting back to a string.
        sampled_seq = sampled_tokens[0, 1:]

        # Convert back to string via lookup
        return ''.join([self.decoder.dictionary.get_tok(a) for a in sampled_seq])
esm/source/esm/inverse_folding/gvp_transformer_encoder.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # Contents of this file were adapted from the open source fairseq repository.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import argparse
9
+ import math
10
+ from typing import Dict, List, Optional
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ from torch import Tensor
15
+
16
+ from esm.modules import SinusoidalPositionalEmbedding
17
+ from .features import GVPInputFeaturizer, DihedralFeatures
18
+ from .gvp_encoder import GVPEncoder
19
+ from .transformer_layer import TransformerEncoderLayer
20
+ from .util import nan_to_num, get_rotation_frames, rotate, rbf
21
+
22
+
23
class GVPTransformerEncoder(nn.Module):
    """
    Transformer encoder consisting of *args.encoder.layers* layers. Each layer
    is a :class:`TransformerEncoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): encoding dictionary
        embed_tokens (torch.nn.Embedding): input embedding
    """

    def __init__(self, args, dictionary, embed_tokens):
        super().__init__()
        self.args = args
        self.dictionary = dictionary

        self.dropout_module = nn.Dropout(args.dropout)

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        # Standard Transformer embedding scale: sqrt(d_model).
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = SinusoidalPositionalEmbedding(
            embed_dim,
            self.padding_idx,
        )
        # Projections for the various structure-derived input features.
        self.embed_gvp_input_features = nn.Linear(15, embed_dim)
        self.embed_confidence = nn.Linear(16, embed_dim)
        self.embed_dihedrals = DihedralFeatures(embed_dim)

        # Collect "gvp_"-prefixed args into a namespace for the GVP encoder
        # (the prefix is stripped).
        gvp_args = argparse.Namespace()
        for k, v in vars(args).items():
            if k.startswith("gvp_"):
                setattr(gvp_args, k[4:], v)
        self.gvp_encoder = GVPEncoder(gvp_args)
        # GVP output is scalars plus flattened 3-vectors.
        gvp_out_dim = gvp_args.node_hidden_dim_scalar + (3 *
                gvp_args.node_hidden_dim_vector)
        self.embed_gvp_output = nn.Linear(gvp_out_dim, embed_dim)

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [self.build_encoder_layer(args) for i in range(args.encoder_layers)]
        )
        self.num_layers = len(self.layers)
        self.layer_norm = nn.LayerNorm(embed_dim)

    def build_encoder_layer(self, args):
        """Factory for one Transformer encoder layer (overridable)."""
        return TransformerEncoderLayer(args)

    def forward_embedding(self, coords, padding_mask, confidence):
        """
        Sum all structure-derived input features into one embedding.

        Args:
            coords: N, CA, C backbone coordinates in shape length x 3 (atoms) x 3
            padding_mask: boolean Tensor (true for padding) of shape length
            confidence: confidence scores between 0 and 1 of shape length

        Returns:
            (embedding after positional encoding and dropout, dict of the
            individual feature components)
        """
        components = dict()
        # A residue's coordinates are valid only if all atoms are finite.
        coord_mask = torch.all(torch.all(torch.isfinite(coords), dim=-1), dim=-1)
        coords = nan_to_num(coords)
        # Non-padding positions are embedded as <mask> tokens (sequence is
        # unknown to the encoder); padding positions use the padding token.
        mask_tokens = (
            padding_mask * self.dictionary.padding_idx +
            ~padding_mask * self.dictionary.get_idx("<mask>")
        )
        components["tokens"] = self.embed_tokens(mask_tokens) * self.embed_scale
        # NOTE(review): key is misspelled ("diherals") but only consumed via
        # sum(components.values()) below, so renaming would be cosmetic.
        components["diherals"] = self.embed_dihedrals(coords)

        # GVP encoder
        gvp_out_scalars, gvp_out_vectors = self.gvp_encoder(coords,
                coord_mask, padding_mask, confidence)
        R = get_rotation_frames(coords)
        # Rotate to local rotation frame for rotation-invariance
        gvp_out_features = torch.cat([
            gvp_out_scalars,
            rotate(gvp_out_vectors, R.transpose(-2, -1)).flatten(-2, -1),
        ], dim=-1)
        components["gvp_out"] = self.embed_gvp_output(gvp_out_features)

        # Confidence is expanded with a radial basis before projection.
        components["confidence"] = self.embed_confidence(
            rbf(confidence, 0., 1.))

        # In addition to GVP encoder outputs, also directly embed GVP input node
        # features to the Transformer
        scalar_features, vector_features = GVPInputFeaturizer.get_node_features(
            coords, coord_mask, with_coord_mask=False)
        features = torch.cat([
            scalar_features,
            rotate(vector_features, R.transpose(-2, -1)).flatten(-2, -1),
        ], dim=-1)
        components["gvp_input_features"] = self.embed_gvp_input_features(features)

        embed = sum(components.values())
        # for k, v in components.items():
        #     print(k, torch.mean(v, dim=(0,1)), torch.std(v, dim=(0,1)))

        x = embed
        x = x + self.embed_positions(mask_tokens)
        x = self.dropout_module(x)
        return x, components

    def forward(
        self,
        coords,
        encoder_padding_mask,
        confidence,
        return_all_hiddens: bool = False,
    ):
        """
        Args:
            coords (Tensor): backbone coordinates
                shape batch_size x num_residues x num_atoms (3 for N, CA, C) x 3
            encoder_padding_mask (ByteTensor): the positions of
                padding elements of shape `(batch_size x num_residues)`
            confidence (Tensor): the confidence score of shape (batch_size x
                num_residues). The value is between 0. and 1. for each residue
                coordinate, or -1. if no coordinate is given
            return_all_hiddens (bool, optional): also return all of the
                intermediate hidden states (default: False).

        Returns:
            dict:
                - **encoder_out** (Tensor): the last encoder layer's output of
                  shape `(num_residues, batch_size, embed_dim)`
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch_size, num_residues)`
                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
                  of shape `(batch_size, num_residues, embed_dim)`
                - **encoder_states** (List[Tensor]): all intermediate
                  hidden states of shape `(num_residues, batch_size, embed_dim)`.
                  Only populated if *return_all_hiddens* is True.
        """
        x, encoder_embedding = self.forward_embedding(coords,
                encoder_padding_mask, confidence)
        # account for padding while computing the representation
        x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x))

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        encoder_states = []

        if return_all_hiddens:
            encoder_states.append(x)

        # encoder layers
        for layer in self.layers:
            x = layer(
                x, encoder_padding_mask=encoder_padding_mask
            )
            if return_all_hiddens:
                assert encoder_states is not None
                encoder_states.append(x)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [encoder_padding_mask],  # B x T
            "encoder_embedding": [encoder_embedding],  # dictionary
            "encoder_states": encoder_states,  # List[T x B x C]
        }
esm/source/esm/inverse_folding/gvp_utils.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+
9
def flatten_graph(node_embeddings, edge_embeddings, edge_index):
    """
    Flattens the graph into a batch size one (with disconnected subgraphs for
    each example) to be compatible with pytorch-geometric package.

    Args:
        node_embeddings: node embeddings in tuple form (scalar, vector)
            - scalar: shape batch size x nodes x node_embed_dim
            - vector: shape batch size x nodes x node_embed_dim x 3
        edge_embeddings: edge embeddings in tuple form (scalar, vector)
            - scalar: shape batch size x edges x edge_embed_dim
            - vector: shape batch size x edges x edge_embed_dim x 3
        edge_index: shape batch_size x 2 (source node and target node) x edges
    Returns:
        node_embeddings: tuple (scalar, vector) of shape total_nodes x ...
        edge_embeddings: tuple (scalar, vector) of shape total_edges x ...
        edge_index: shape 2 x total_edges
    """
    node_s, node_v = node_embeddings
    edge_s, edge_v = edge_embeddings
    batch_size, nodes_per_graph = node_s.shape[0], node_s.shape[1]

    flat_nodes = (torch.flatten(node_s, 0, 1), torch.flatten(node_v, 0, 1))
    flat_edge_s = torch.flatten(edge_s, 0, 1)
    flat_edge_v = torch.flatten(edge_v, 0, 1)

    # An edge is real if at least one endpoint is not the -1 padding marker.
    # Compute the mask BEFORE offsetting, since offsets destroy the -1s.
    valid = torch.any(edge_index != -1, dim=1).flatten()

    # Re-number nodes so each graph occupies its own disjoint index range.
    offsets = (torch.arange(batch_size, device=edge_index.device)
               * nodes_per_graph).view(-1, 1, 1)
    flat_index = (edge_index + offsets).permute(1, 0, 2).flatten(1, 2)

    flat_index = flat_index[:, valid]
    flat_edges = (flat_edge_s[valid, :], flat_edge_v[valid, :])
    return flat_nodes, flat_edges, flat_index
48
+
49
+
50
def unflatten_graph(node_embeddings, batch_size):
    """
    Unflattens node embeddings back to per-batch form.

    Args:
        node_embeddings: node embeddings in tuple form (scalar, vector)
            - scalar: shape total_nodes x node_embed_dim
            - vector: shape total_nodes x node_embed_dim x 3
        batch_size: int
    Returns:
        node_embeddings: tuple (scalar, vector)
            - scalar: shape batch_size x nodes x node_embed_dim
            - vector: shape batch_size x nodes x node_embed_dim x 3
    """
    scalars, vectors = node_embeddings
    scalars = scalars.reshape(batch_size, -1, scalars.shape[-1])
    vectors = vectors.reshape(batch_size, -1, vectors.shape[-2], vectors.shape[-1])
    return (scalars, vectors)
67
+
68
+
esm/source/esm/inverse_folding/multichain_util.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import biotite.structure
7
+ import numpy as np
8
+ import torch
9
+ from typing import Sequence, Tuple, List
10
+
11
+ from esm.inverse_folding.util import (
12
+ load_structure,
13
+ extract_coords_from_structure,
14
+ load_coords,
15
+ get_sequence_loss,
16
+ get_encoder_output,
17
+ )
18
+
19
+
20
def extract_coords_from_complex(structure: biotite.structure.AtomArray):
    """
    Extract per-chain backbone coordinates and sequences from a complex.

    Args:
        structure: biotite AtomArray
    Returns:
        Tuple (coords, seqs)
        - coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
          coordinates representing the backbone of each chain
        - seqs: Dictionary mapping chain ids to native sequences of each chain
    """
    coords = {}
    seqs = {}
    # Process one chain at a time; each call handles a single-chain subset.
    for chain_id in biotite.structure.get_chains(structure):
        chain_atoms = structure[structure.chain_id == chain_id]
        chain_coords, chain_seq = extract_coords_from_structure(chain_atoms)
        coords[chain_id] = chain_coords
        seqs[chain_id] = chain_seq
    return coords, seqs
+ return coords, seqs
37
+
38
+
39
def load_complex_coords(fpath, chains):
    """
    Load backbone coordinates and native sequences for the given chains.

    Args:
        fpath: filepath to either pdb or cif file
        chains: the chain ids (the order matters for autoregressive model)
    Returns:
        Tuple (coords, seqs)
        - coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
          coordinates representing the backbone of each chain
        - seqs: Dictionary mapping chain ids to native sequences of each chain
    """
    atom_array = load_structure(fpath, chains)
    return extract_coords_from_complex(atom_array)
52
+
53
+
54
+ def _concatenate_coords(coords, target_chain_id, padding_length=10):
55
+ """
56
+ Args:
57
+ coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
58
+ coordinates representing the backbone of each chain
59
+ target_chain_id: The chain id to sample sequences for
60
+ padding_length: Length of padding between concatenated chains
61
+ Returns:
62
+ Tuple (coords, seq)
63
+ - coords is an L x 3 x 3 array for N, CA, C coordinates, a
64
+ concatenation of the chains with padding in between
65
+ - seq is the extracted sequence, with padding tokens inserted
66
+ between the concatenated chains
67
+ """
68
+ pad_coords = np.full((padding_length, 3, 3), np.nan, dtype=np.float32)
69
+ # For best performance, put the target chain first in concatenation.
70
+ coords_list = [coords[target_chain_id]]
71
+ for chain_id in coords:
72
+ if chain_id == target_chain_id:
73
+ continue
74
+ coords_list.append(pad_coords)
75
+ coords_list.append(coords[chain_id])
76
+ coords_concatenated = np.concatenate(coords_list, axis=0)
77
+ return coords_concatenated
78
+
79
+
80
def sample_sequence_in_complex(model, coords, target_chain_id, temperature=1.,
        padding_length=10):
    """
    Samples sequence for one chain in a complex.

    Args:
        model: An instance of the GVPTransformer model
        coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
            coordinates representing the backbone of each chain
        target_chain_id: The chain id to sample sequences for
        temperature: Sampling temperature passed to model.sample
        padding_length: padding length in between chains
    Returns:
        Sampled sequence for the target chain
    """
    target_chain_len = coords[target_chain_id].shape[0]
    # Bug fix: forward padding_length; previously the argument was accepted
    # but silently ignored, always using _concatenate_coords' default.
    all_coords = _concatenate_coords(coords, target_chain_id,
                                     padding_length=padding_length)
    device = next(model.parameters()).device

    # Supply padding tokens for other chains to avoid unused sampling for speed
    padding_pattern = ['<pad>'] * all_coords.shape[0]
    for i in range(target_chain_len):
        padding_pattern[i] = '<mask>'
    sampled = model.sample(all_coords, partial_seq=padding_pattern,
            temperature=temperature, device=device)
    # Only the target chain (placed first in the concatenation) is returned.
    sampled = sampled[:target_chain_len]
    return sampled
105
+
106
+
107
def score_sequence_in_complex(model, alphabet, coords, target_chain_id,
        target_seq, padding_length=10):
    """
    Scores sequence for one chain in a complex.

    Args:
        model: An instance of the GVPTransformer model
        alphabet: Alphabet for the model
        coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
            coordinates representing the backbone of each chain
        target_chain_id: The chain id to sample sequences for
        target_seq: Target sequence for the target chain for scoring.
        padding_length: padding length in between chains
    Returns:
        Tuple (ll_fullseq, ll_withcoord)
        - ll_fullseq: Average log-likelihood over the full target chain
        - ll_withcoord: Average log-likelihood in target chain excluding those
          residues without coordinates
    """
    # Bug fix: forward padding_length; previously the argument was accepted
    # but silently ignored, always using _concatenate_coords' default.
    all_coords = _concatenate_coords(coords, target_chain_id,
                                     padding_length=padding_length)

    loss, target_padding_mask = get_sequence_loss(model, alphabet, all_coords,
            target_seq)
    # Mean negative loss (= log-likelihood) over non-padding positions.
    ll_fullseq = -np.sum(loss * ~target_padding_mask) / np.sum(
            ~target_padding_mask)

    # Also calculate average when excluding masked portions
    coord_mask = np.all(np.isfinite(coords[target_chain_id]), axis=(-1, -2))
    ll_withcoord = -np.sum(loss * coord_mask) / np.sum(coord_mask)
    return ll_fullseq, ll_withcoord
136
+
137
+
138
def get_encoder_output_for_complex(model, alphabet, coords, target_chain_id):
    """
    Compute the encoder output for the target chain within a complex.

    Args:
        model: An instance of the GVPTransformer model
        alphabet: Alphabet for the model
        coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
            coordinates representing the backbone of each chain
        target_chain_id: The chain id to sample sequences for
    Returns:
        Encoder output for the target chain only (the first target_chain_len
        positions of the concatenated complex representation).
    """
    # Doc fix: the previous docstring claimed a per-chain dictionary was
    # returned, but only the target chain's slice is produced.
    all_coords = _concatenate_coords(coords, target_chain_id)
    all_rep = get_encoder_output(model, alphabet, all_coords)
    target_chain_len = coords[target_chain_id].shape[0]
    return all_rep[:target_chain_len]
esm/source/esm/inverse_folding/transformer_decoder.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # Contents of this file were adapted from the open source fairseq repository.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import math
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch import Tensor
14
+
15
+ from esm.modules import SinusoidalPositionalEmbedding
16
+ from .transformer_layer import TransformerDecoderLayer
17
+
18
+
19
def fill_with_neg_inf(t):
    """FP16-compatible fill of *t* with -inf.

    The fill happens in float32 and the result is cast back to t's dtype,
    sidestepping issues with writing -inf directly into half precision.
    """
    filled = t.float().fill_(float("-inf"))
    return filled.type_as(t)
22
+
23
+
24
class TransformerDecoder(nn.Module):
    """
    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`TransformerDecoderLayer`.

    Adapted from fairseq's TransformerDecoder.

    Args:
        args (argparse.Namespace): parsed command-line arguments; must provide
            `dropout`, `decoder_embed_dim`, and `decoder_layers`
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
    """

    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
    ):
        super().__init__()
        self.args = args
        self.dictionary = dictionary
        # Cached causal attention mask; lazily built and resized in
        # buffered_future_mask so it is allocated only once per size/device.
        self._future_mask = torch.empty(0)

        self.dropout_module = nn.Dropout(args.dropout)

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim

        self.padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        # Scale token embeddings by sqrt(embed_dim), the standard Transformer
        # convention.
        self.embed_scale = math.sqrt(embed_dim)

        # Optional projection used only when the token-embedding dimension
        # differs from the decoder model dimension.
        self.project_in_dim = (
            nn.Linear(input_embed_dim, embed_dim, bias=False)
            if embed_dim != input_embed_dim
            else None
        )
        self.embed_positions = SinusoidalPositionalEmbedding(
            embed_dim,
            self.padding_idx,
        )

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [
                self.build_decoder_layer(args)
                for _ in range(args.decoder_layers)
            ]
        )
        self.num_layers = len(self.layers)
        self.layer_norm = nn.LayerNorm(embed_dim)

        self.build_output_projection(args, dictionary)

    def build_output_projection(self, args, dictionary):
        # Project decoder features to vocabulary logits (no bias, scaled
        # normal init as in fairseq).
        self.output_projection = nn.Linear(
            args.decoder_embed_dim, len(dictionary), bias=False
        )
        nn.init.normal_(
            self.output_projection.weight, mean=0, std=args.decoder_embed_dim ** -0.5
        )

    def build_decoder_layer(self, args):
        # Factory hook so subclasses can substitute a different layer type.
        return TransformerDecoderLayer(args)

    def forward(
        self,
        prev_output_tokens,
        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        features_only: bool = False,
        return_all_hiddens: bool = False,
    ):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (optional): output from the encoder, used for
                encoder-side attention, should be of size T x B x C
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False).
            return_all_hiddens (bool, optional): unused here; inner states are
                always collected by extract_features.

        Returns:
            tuple:
                - the decoder's output of shape `(batch, vocab, tgt_len)`
                  (note the trailing transpose below), or
                  `(batch, tgt_len, embed_dim)` when *features_only* is True
                  before the same transpose is applied
                - a dictionary with any model-specific outputs
        """

        x, extra = self.extract_features(
            prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
        )

        if not features_only:
            x = self.output_layer(x)
        x = x.transpose(1, 2)  # B x T x C -> B x C x T
        return x, extra

    def extract_features(
        self,
        prev_output_tokens,
        encoder_out: Optional[Dict[str, List[Tensor]]],
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
    ):
        """
        Similar to *forward* but only return features.

        Includes several features from "Jointly Learning to Align and
        Translate with Transformer Models" (Garg et al., EMNLP 2019).

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with any model-specific outputs
        """
        bs, slen = prev_output_tokens.size()

        enc: Optional[Tensor] = None
        padding_mask: Optional[Tensor] = None
        if encoder_out is not None and len(encoder_out["encoder_out"]) > 0:
            enc = encoder_out["encoder_out"][0]
            assert (
                enc.size()[1] == bs
            ), f"Expected enc.shape == (t, {bs}, c) got {enc.shape}"
        if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0:
            padding_mask = encoder_out["encoder_padding_mask"][0]

        # Embed positions over the FULL token history so sinusoidal positions
        # stay consistent during incremental decoding; the slice below then
        # keeps only the newest step.
        positions = self.embed_positions(
            prev_output_tokens
        )

        if incremental_state is not None:
            # Incremental decoding: only the most recent token is processed;
            # earlier steps are cached inside each layer's incremental_state.
            prev_output_tokens = prev_output_tokens[:, -1:]
            positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        x += positions

        x = self.dropout_module(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # Only materialize the padding mask if padding is actually present.
        self_attn_padding_mask: Optional[Tensor] = None
        if prev_output_tokens.eq(self.padding_idx).any():
            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)

        # decoder layers
        attn: Optional[Tensor] = None
        inner_states: List[Optional[Tensor]] = [x]
        for idx, layer in enumerate(self.layers):
            if incremental_state is None:
                # Full-sequence (teacher-forced) pass needs a causal mask;
                # single-step incremental decoding does not.
                self_attn_mask = self.buffered_future_mask(x)
            else:
                self_attn_mask = None

            x, layer_attn, _ = layer(
                x,
                enc,
                padding_mask,
                incremental_state,
                self_attn_mask=self_attn_mask,
                self_attn_padding_mask=self_attn_padding_mask,
                need_attn=False,
                need_head_weights=False,
            )
            inner_states.append(x)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        return x, {"inner_states": inner_states}

    def output_layer(self, features):
        """Project features to the vocabulary size."""
        return self.output_projection(features)

    def buffered_future_mask(self, tensor):
        # Returns an upper-triangular -inf mask of size (dim, dim) where
        # dim is the time dimension of `tensor`; cached across calls.
        dim = tensor.size(0)
        # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround.
        if (
            self._future_mask.size(0) == 0
            or (not self._future_mask.device == tensor.device)
            or self._future_mask.size(0) < dim
        ):
            self._future_mask = torch.triu(
                fill_with_neg_inf(torch.zeros([dim, dim])), 1
            )
        self._future_mask = self._future_mask.to(tensor)
        return self._future_mask[:dim, :dim]