Mirror of github.com/kouroshSA/ppiGPLM + MED4 ckpt_7e checkpoint and sidecar note
Browse files- .gitattributes +2 -0
- .gitignore +34 -0
- LES-wrapper.md +160 -0
- LES-wrapper.py +662 -0
- LICENSE +39 -0
- MED4-PPIs-low-confidence_ppiGPLM_prompts.csv +0 -0
- MED4_100_PRS.csv +101 -0
- MED4_100_RRS.csv +101 -0
- README.md +195 -0
- assets/MANUSCRIPT_NUMBERS_TO_VERIFY.md +7 -0
- assets/ppiGPLM.png +3 -0
- assets/tri_model_consensus.png +3 -0
- assets/tri_model_consensus.svg +149 -0
- checkpoints/ppiGPLM_ckpt_7e.md +30 -0
- checkpoints/ppiGPLM_ckpt_7e.pt +3 -0
- config/finetune_label3.py +29 -0
- config/train_par_gpt2-s_scratch.py +49 -0
- configurator.py +47 -0
- data/MED4_char/meta.pkl +3 -0
- data/MED4_char/prepare.py +68 -0
- data/MED4_char/readme.md +15 -0
- model.py +330 -0
- requirements.txt +7 -0
- roc_analysis_color_threshold_F1e.py +210 -0
- sample_fasta3.3_softmax_error_handling3e.py +164 -0
- train_.py +333 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/ppiGPLM.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/tri_model_consensus.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model checkpoints (too large for GitHub)
|
| 2 |
+
out/ckpt.pt
|
| 3 |
+
*.pt
|
| 4 |
+
# Released checkpoints under checkpoints/ are tracked (on Hugging Face)
|
| 5 |
+
!checkpoints/*.pt
|
| 6 |
+
!checkpoints/*.pth
|
| 7 |
+
|
| 8 |
+
# Large data files
|
| 9 |
+
data/*/input.txt
|
| 10 |
+
data/*/input.csv
|
| 11 |
+
data/*/train.bin
|
| 12 |
+
data/*/val.bin
|
| 13 |
+
data/*/*.csv
|
| 14 |
+
|
| 15 |
+
# Large prompt files
|
| 16 |
+
MED4-PPIs-low-confidence_ppiGPLM_cleaned2_prompts.csv
|
| 17 |
+
MED4-PPIs-low-confidence_ppiGPLM_cleaned2_prompts.txt
|
| 18 |
+
|
| 19 |
+
# Python cache
|
| 20 |
+
__pycache__/
|
| 21 |
+
*.pyc
|
| 22 |
+
|
| 23 |
+
# Output files
|
| 24 |
+
*_output.csv
|
| 25 |
+
*_probabilities.csv
|
| 26 |
+
*_classifications.txt
|
| 27 |
+
LES_results*/
|
| 28 |
+
|
| 29 |
+
# OS files
|
| 30 |
+
.DS_Store
|
| 31 |
+
Thumbs.db
|
| 32 |
+
|
| 33 |
+
# Wandb logs
|
| 34 |
+
wandb/
|
LES-wrapper.md
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LES-wrapper: Learning Efficiency Score Evaluation
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
The **LES-wrapper** automates the evaluation of model trainability across multiple
|
| 6 |
+
training checkpoints. It runs inference on PRS (Positive Reference Set) and RRS
|
| 7 |
+
(Random Reference Set) datasets at each checkpoint, computes ROC metrics, and
|
| 8 |
+
derives integrated learning efficiency scores.
|
| 9 |
+
|
| 10 |
+
## What is LES?
|
| 11 |
+
|
| 12 |
+
LES (Learning Efficiency Score) is defined as the **area under the metric-vs-iteration
|
| 13 |
+
curve**. Unlike metrics that measure only final performance, LES captures the entire
|
| 14 |
+
learning trajectory:
|
| 15 |
+
|
| 16 |
+
- **LES-AUC**: Area under the AUC trajectory curve
|
| 17 |
+
- **LES-F1**: Area under the Best-F1 trajectory curve
|
| 18 |
+
- **LES-Threshold**: Area under the optimal-threshold trajectory curve
|
| 19 |
+
|
| 20 |
+
Higher LES values indicate faster learning, better overall performance across training,
|
| 21 |
+
and more efficient use of training iterations.
|
| 22 |
+
|
| 23 |
+
## Workflow
|
| 24 |
+
|
| 25 |
+
For each checkpoint the wrapper:
|
| 26 |
+
|
| 27 |
+
1. Runs inference on PRS and RRS prompt files
|
| 28 |
+
2. Extracts softmax probabilities for the positive class
|
| 29 |
+
3. Combines probabilities into a single file for ROC analysis
|
| 30 |
+
4. Computes AUC, Best-F1, and optimal threshold
|
| 31 |
+
5. Generates a color-coded ROC curve plot
|
| 32 |
+
6. Aggregates results into a summary table
|
| 33 |
+
7. Plots metric trajectories across checkpoints
|
| 34 |
+
8. Computes LES values for each metric
|
| 35 |
+
|
| 36 |
+
## Installation
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
conda activate gpt
|
| 40 |
+
pip install scikit-learn matplotlib numpy
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## Basic Usage
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
python LES-wrapper.py \
|
| 47 |
+
--checkpoint_dir out_ppiGPLM_MED4 \
|
| 48 |
+
--prs_file MED4_Int_100pairs_prompts.txt \
|
| 49 |
+
--rrs_file MED4_100_RND_prompts.txt \
|
| 50 |
+
--output_dir LES_results_MED4 \
|
| 51 |
+
--vanilla
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
The `--vanilla` flag is required when evaluating standard ppiGPLM checkpoints
|
| 55 |
+
(i.e., checkpoints trained with `train_.py` rather than a HOPE-variant trainer).
|
| 56 |
+
|
| 57 |
+
## Common Patterns
|
| 58 |
+
|
| 59 |
+
### Selecting Specific Checkpoints
|
| 60 |
+
|
| 61 |
+
```bash
|
| 62 |
+
# Only checkpoints at iterations 1000, 2000, and 5000
|
| 63 |
+
python LES-wrapper.py \
|
| 64 |
+
--checkpoint_dir out_ppiGPLM_MED4 \
|
| 65 |
+
--prs_file prs.txt \
|
| 66 |
+
--rrs_file rrs.txt \
|
| 67 |
+
--output_dir results \
|
| 68 |
+
--checkpoint_pattern "ckpt_[125]000.pt" \
|
| 69 |
+
--vanilla
|
| 70 |
+
|
| 71 |
+
# Every checkpoint whose iteration number ends in 000 (e.g. every 1000 iterations)
|
| 72 |
+
python LES-wrapper.py \
|
| 73 |
+
--checkpoint_dir out_ppiGPLM_MED4 \
|
| 74 |
+
--prs_file prs.txt \
|
| 75 |
+
--rrs_file rrs.txt \
|
| 76 |
+
--output_dir results \
|
| 77 |
+
--checkpoint_pattern "ckpt_*000.pt" \
|
| 78 |
+
--vanilla
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
### Skipping Inference (Re-computing Metrics Only)
|
| 82 |
+
|
| 83 |
+
If you have already run inference and just want to recompute metrics or plots:
|
| 84 |
+
|
| 85 |
+
```bash
|
| 86 |
+
python LES-wrapper.py \
|
| 87 |
+
--checkpoint_dir out_ppiGPLM_MED4 \
|
| 88 |
+
--prs_file MED4_Int_100pairs_prompts.txt \
|
| 89 |
+
--rrs_file MED4_100_RND_prompts.txt \
|
| 90 |
+
--output_dir LES_results_MED4 \
|
| 91 |
+
--skip_inference \
|
| 92 |
+
--vanilla
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
### Plot Customization
|
| 96 |
+
|
| 97 |
+
Use `--no_plots` to skip trajectory figure generation when you only need the
|
| 98 |
+
summary CSV.
|
| 99 |
+
|
| 100 |
+
## Command-Line Arguments (Vanilla Flags)
|
| 101 |
+
|
| 102 |
+
| Argument | Default | Description |
|
| 103 |
+
|----------|---------|-------------|
|
| 104 |
+
| `--checkpoint_dir` | *(required)* | Directory containing checkpoint files (`ckpt_*.pt`) |
|
| 105 |
+
| `--prs_file` | *(required)* | Path to Positive Reference Set prompts file |
|
| 106 |
+
| `--rrs_file` | *(required)* | Path to Random Reference Set prompts file |
|
| 107 |
+
| `--output_dir` | `LES_results` | Directory for all output files |
|
| 108 |
+
| `--checkpoint_pattern` | `ckpt_*.pt` | Glob pattern to select checkpoints |
|
| 109 |
+
| `--include_final` | False | Also evaluate `ckpt.pt` (the final checkpoint) |
|
| 110 |
+
| `--no_plots` | False | Skip generating trajectory plots |
|
| 111 |
+
| `--skip_inference` | False | Skip inference; reuse existing probability files |
|
| 112 |
+
| `--vanilla` | False | Use standard GPT checkpoint format (required for ppiGPLM) |
|
| 113 |
+
|
| 114 |
+
## Output Structure
|
| 115 |
+
|
| 116 |
+
```
|
| 117 |
+
LES_results/
|
| 118 |
+
├── ckpt_1000/
|
| 119 |
+
│ ├── PRS_iter1000_probabilities.csv
|
| 120 |
+
│ ├── PRS_iter1000_classifications.txt
|
| 121 |
+
│ ├── RRS_iter1000_probabilities.csv
|
| 122 |
+
│ ├── RRS_iter1000_classifications.txt
|
| 123 |
+
│ ├── combined_probabilities_iter1000.csv
|
| 124 |
+
│ ├── ROC_iter1000.png
|
| 125 |
+
│ └── inference_log.md
|
| 126 |
+
├── ckpt_2000/ ...
|
| 127 |
+
├── trajectory_AUC.png
|
| 128 |
+
├── trajectory_F1.png
|
| 129 |
+
├── trajectory_Threshold.png
|
| 130 |
+
├── trajectory_combined.png
|
| 131 |
+
├── summary_table.csv
|
| 132 |
+
└── manifest.json
|
| 133 |
+
```
|
| 134 |
+
|
| 135 |
+
`summary_table.csv` contains per-checkpoint metrics plus a final row with the
|
| 136 |
+
integrated LES values. `manifest.json` records complete run metadata.
|
| 137 |
+
|
| 138 |
+
---
|
| 139 |
+
|
| 140 |
+
## Appendix: HOPE/Titan Checkpoint Support (not used for vanilla ppiGPLM)
|
| 141 |
+
|
| 142 |
+
The script also supports checkpoints from HOPE/Titan-variant trainers. These flags
|
| 143 |
+
are no-ops when `--vanilla` is passed, so they do not affect standard ppiGPLM runs.
|
| 144 |
+
|
| 145 |
+
| Flag | Description |
|
| 146 |
+
|------|-------------|
|
| 147 |
+
| `--use_titan_in_forward` | Override Titan memory-in-forward flag (-1 = use checkpoint value) |
|
| 148 |
+
| `--enable_surprise_updates` | Enable Titan surprise-based memory updates (0/1) |
|
| 149 |
+
| `--surprise_update_in_eval` | Allow memory updates during evaluation (0/1) |
|
| 150 |
+
| `--adapt_mode` | Prefix-adaptation mode: `none` or `prefix` |
|
| 151 |
+
| `--adapt_steps` | Number of adaptation steps or teaching epochs |
|
| 152 |
+
| `--memory_state_in` | Path to a saved memory-only state file |
|
| 153 |
+
| `--teach_file` | CSV of supervised teaching pairs for pre-evaluation conditioning |
|
| 154 |
+
| `--teach_delim` | Delimiter for the teaching CSV (default `\|`) |
|
| 155 |
+
| `--teach_has_header` | Whether the teaching CSV has a header row (0/1) |
|
| 156 |
+
| `--teach_reset_policy` | Memory reset policy during teaching: `pair`, `file`, or `none` |
|
| 157 |
+
| `--teach_shuffle` | Shuffle teaching examples each epoch (0/1) |
|
| 158 |
+
| `--teach_max_rows` | Limit teaching rows; 0 = use all |
|
| 159 |
+
|
| 160 |
+
These flags are relevant to the HOPE project (if/when it has its own public repo).
|
LES-wrapper.py
ADDED
|
@@ -0,0 +1,662 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
"""
|
| 3 |
+
LES-wrapper.py — Learning Efficiency Score evaluation across training checkpoints.
|
| 4 |
+
|
| 5 |
+
Runs inference on PRS (Positive Reference Set) and RRS (Random Reference Set)
|
| 6 |
+
prompts at every saved checkpoint in a directory, computes ROC-AUC, optimal-F1
|
| 7 |
+
threshold, and Best-F1 at each checkpoint, then integrates these into a single
|
| 8 |
+
Learning Efficiency Score (LES) per metric — the area under the
|
| 9 |
+
metric-vs-iteration curve.
|
| 10 |
+
|
| 11 |
+
Note: This script supports both vanilla GPT checkpoints (use --vanilla) and
|
| 12 |
+
HOPE/Titan checkpoints (the --use_titan_in_forward, --enable_surprise_updates,
|
| 13 |
+
--adapt_mode, --teach_* flags). When evaluating ppiGPLM models, use --vanilla;
|
| 14 |
+
the HOPE-specific flags are no-ops for vanilla checkpoints.
|
| 15 |
+
|
| 16 |
+
Basic usage:
|
| 17 |
+
python LES-wrapper.py \\
|
| 18 |
+
--checkpoint_dir <dir> \\
|
| 19 |
+
--prs_file <prs.txt> \\
|
| 20 |
+
--rrs_file <rrs.txt> \\
|
| 21 |
+
--output_dir <out> \\
|
| 22 |
+
--vanilla
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import os
|
| 26 |
+
import sys
|
| 27 |
+
import re
|
| 28 |
+
import glob
|
| 29 |
+
import argparse
|
| 30 |
+
import subprocess
|
| 31 |
+
import csv
|
| 32 |
+
import json
|
| 33 |
+
from datetime import datetime
|
| 34 |
+
import numpy as np
|
| 35 |
+
import matplotlib.pyplot as plt
|
| 36 |
+
from sklearn.metrics import roc_curve, auc, f1_score
|
| 37 |
+
|
| 38 |
+
# -----------------------------------------------------------------------------
|
| 39 |
+
# Parse command-line arguments
|
| 40 |
+
# -----------------------------------------------------------------------------
|
| 41 |
+
def parse_args():
    """Define the LES-wrapper command-line options and parse ``sys.argv``.

    Returns:
        The ``(namespace, unknown_args)`` tuple from
        ``ArgumentParser.parse_known_args()``. Options that are not declared
        here are returned in ``unknown_args`` rather than causing an error.
    """
    usage_examples = """
Examples:
python LES-wrapper.py --checkpoint_dir out-model --prs_file prs.txt --rrs_file rrs.txt --output_dir results
python LES-wrapper.py --checkpoint_dir out-model --prs_file prs.txt --rrs_file rrs.txt --use_titan_in_forward=1
"""
    cli = argparse.ArgumentParser(
        description='LES-wrapper: Learning Efficiency Score evaluation across checkpoints',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    add = cli.add_argument

    # --- Mandatory inputs -------------------------------------------------
    add('--checkpoint_dir', required=True, type=str,
        help='Directory containing model checkpoints (ckpt_*.pt files)')
    add('--prs_file', required=True, type=str,
        help='Path to Positive Reference Set prompts file')
    add('--rrs_file', required=True, type=str,
        help='Path to Random Reference Set prompts file')

    # --- Where results are written ----------------------------------------
    add('--output_dir', default='LES_results', type=str,
        help='Directory to save all outputs (default: LES_results)')

    # --- Which checkpoints are evaluated ----------------------------------
    add('--checkpoint_pattern', default='ckpt_*.pt', type=str,
        help='Pattern to match checkpoint files (default: ckpt_*.pt)')
    add('--include_final', action='store_true',
        help='Also include ckpt.pt (final checkpoint) if present')

    # --- Titan/HOPE options (forwarded to the sample script) --------------
    add('--use_titan_in_forward', default=-1, type=int,
        help='Override use_titan_in_forward (-1=use checkpoint value)')
    add('--enable_surprise_updates', default=0, type=int,
        help='Enable Titan surprise updates (0/1)')
    add('--surprise_update_in_eval', default=0, type=int,
        help='Allow memory updates during eval (0/1)')
    add('--adapt_mode', default='none', type=str,
        choices=['none', 'prefix'], help='Adaptation mode')
    add('--adapt_steps', default=0, type=int,
        help='Number of adaptation steps (0=disabled). When --teach_file is provided, this means teaching epochs.')

    # --- Saved memory state -----------------------------------------------
    add('--memory_state_in', default='', type=str,
        help='Path to load memory-only state file')

    # --- Supervised teaching file -----------------------------------------
    add('--teach_file', default='', type=str,
        help='Path to teaching CSV file for supervised adaptation')
    add('--teach_delim', default='|', type=str,
        help='Delimiter for teaching CSV (default: |)')
    add('--teach_has_header', default=1, type=int,
        help='Whether teaching CSV has header row (default: 1)')
    add('--teach_reset_policy', default='pair', type=str,
        choices=['pair', 'file', 'none'],
        help='Memory reset policy during teaching')
    add('--teach_shuffle', default=1, type=int,
        help='Shuffle teaching examples each epoch (default: 1)')
    add('--teach_max_rows', default=0, type=int,
        help='Limit number of teaching rows loaded (0=all)')

    # --- Reuse existing inference output ----------------------------------
    add('--skip_inference', action='store_true',
        help='Skip inference step (use existing probability files)')

    # --- Plotting ---------------------------------------------------------
    add('--no_plots', action='store_true',
        help='Skip generating trajectory plots')

    # --- Vanilla mode (standard GPT model without HOPE features) ----------
    add('--vanilla', action='store_true',
        help='Use vanilla GPT model (no HOPE/CMS/Titan features)')

    return cli.parse_known_args()
|
| 116 |
+
|
| 117 |
+
# -----------------------------------------------------------------------------
|
| 118 |
+
# Helper functions
|
| 119 |
+
# -----------------------------------------------------------------------------
|
| 120 |
+
def extract_iteration_from_checkpoint(ckpt_name):
    """Return the training iteration encoded in a checkpoint filename.

    Args:
        ckpt_name: Bare filename such as ``ckpt_1000.pt`` or
            ``ckpt_iter_1000.pt``.

    Returns:
        The iteration as an ``int``; ``float('inf')`` for the un-numbered
        final checkpoint ``ckpt.pt`` (so it sorts last); ``None`` when the
        name carries no recognisable iteration number.
    """
    # The final checkpoint has no number; the regex below can never match it.
    if ckpt_name == 'ckpt.pt':
        return float('inf')

    numbered = re.search(r'ckpt_?(?:iter_)?(\d+)\.pt$', ckpt_name)
    return int(numbered.group(1)) if numbered else None
|
| 130 |
+
|
| 131 |
+
def get_checkpoints(checkpoint_dir, pattern, include_final=False):
    """Collect the checkpoints in a directory, ordered by training iteration.

    Args:
        checkpoint_dir: Directory that is searched.
        pattern: Glob pattern (relative to ``checkpoint_dir``) selecting files.
        include_final: When true, ``ckpt.pt`` is added if it exists and the
            pattern did not already select it.

    Returns:
        A list of ``(filename, iteration, full_path)`` tuples sorted by
        iteration. Files whose name yields no iteration are left out.
    """
    candidates = glob.glob(os.path.join(checkpoint_dir, pattern))

    if include_final:
        final_path = os.path.join(checkpoint_dir, 'ckpt.pt')
        if final_path not in candidates and os.path.exists(final_path):
            candidates.append(final_path)

    found = []
    for path in candidates:
        name = os.path.basename(path)
        iteration = extract_iteration_from_checkpoint(name)
        if iteration is None:
            continue
        found.append((name, iteration, path))

    # Stable sort on the iteration only, exactly like list.sort(key=...).
    return sorted(found, key=lambda entry: entry[1])
|
| 150 |
+
|
| 151 |
+
def run_inference(sample_script, model_dir, ckpt_name, input_file, output_dir,
                  output_prefix, extra_args, vanilla=False):
    """Run the sampling script on one prompt file for one checkpoint.

    The script is launched with the current Python interpreter as a child
    process; its stdout/stderr are captured rather than streamed.

    Args:
        sample_script: Path of the inference script to execute.
        model_dir: Value passed as ``--model_dir``.
        ckpt_name: Value passed as ``--ckpt_name``.
        input_file: Value passed as ``--input_file``.
        output_dir: Value passed as ``--output_dir``.
        output_prefix: Value passed as ``--output_prefix``.
        extra_args: Sequence of additional command-line tokens appended
            verbatim to the command.
        vanilla: Accepted for interface compatibility; it is not read inside
            this function. Presumably the caller uses it to decide which
            HOPE-specific options go into ``extra_args`` - confirm at the
            call site.

    Returns:
        ``True`` when the child process exits with status 0, else ``False``.
    """
    cmd = [
        sys.executable, sample_script,
        '--input_file', input_file,
        '--output_dir', output_dir,
        '--output_prefix', output_prefix,
        '--model_dir', model_dir,
        '--ckpt_name', ckpt_name,
    ] + list(extra_args)

    # Only the first eight tokens are echoed to keep the log line short.
    print(f" Running: {' '.join(cmd[:8])}...")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print(f" ERROR: Inference failed for {ckpt_name}")
        # Show the END of stderr: a Python traceback puts the actual
        # exception on its last lines, which a head-truncation would cut off.
        print(f" stderr: {result.stderr[-500:]}")
        return False

    return True
|
| 177 |
+
|
| 178 |
+
def extract_probabilities_from_csv(csv_path):
    """Read the positive-class probability of every row in an inference CSV.

    Rows are laid out as ``Prompt, Probability_of_1, Probability_of_0``, but
    the prompt itself may contain commas (e.g. ``<ps1>,SEQ,<ps2>,SEQ,<``), so
    the probability columns are addressed from the end of the row:
    ``Probability_of_1`` is the second-to-last field.

    Args:
        csv_path: Path of the CSV written by the inference script.

    Returns:
        A list of floats, one per parseable data row. The first row is
        treated as a header and skipped; rows with fewer than two fields or a
        non-numeric second-to-last field are ignored. If the file does not
        exist a warning is printed and an empty list is returned.
    """
    if not os.path.exists(csv_path):
        print(f" WARNING: File not found: {csv_path}")
        return []

    collected = []
    with open(csv_path, 'r') as handle:
        rows = csv.reader(handle)
        next(rows, None)  # discard the header row, if any
        for fields in rows:
            if len(fields) < 2:
                continue
            try:
                collected.append(float(fields[-2]))
            except (ValueError, IndexError):
                # Malformed row: skip it and keep reading.
                continue
    return collected
|
| 204 |
+
|
| 205 |
+
def combine_probabilities(prs_probs, rrs_probs, output_path):
    """Write PRS and RRS probabilities side by side into one headerless CSV.

    Column 0 holds the PRS values and column 1 the RRS values. When the two
    inputs differ in length, the shorter column is padded with empty strings.
    No header row is written because the ROC step expects raw values.

    Args:
        prs_probs: Probabilities for the Positive Reference Set.
        rrs_probs: Probabilities for the Random Reference Set.
        output_path: Destination CSV path (overwritten if present).

    Returns:
        ``output_path``, unchanged.
    """
    n_rows = max(len(prs_probs), len(rrs_probs))

    def padded(column):
        # Extend a column with '' so both columns have n_rows entries.
        return list(column) + [''] * (n_rows - len(column))

    with open(output_path, 'w', newline='') as handle:
        csv.writer(handle).writerows(zip(padded(prs_probs), padded(rrs_probs)))

    return output_path
|
| 218 |
+
|
| 219 |
+
def run_roc_analysis_internal(combined_csv_path, output_plot_path):
    """Run ROC analysis on a combined PRS/RRS CSV, save the plot, return metrics.

    The CSV is read without a header: column 0 holds PRS probabilities
    (labelled 1) and column 1 holds RRS probabilities (labelled 0). Empty
    cells are skipped, so the two columns may differ in length.

    Args:
        combined_csv_path: Path of the two-column probability CSV.
        output_plot_path: Path where the ROC figure is written as PNG.

    Returns:
        ``(roc_auc, best_f1, best_thresh)``. ``(None, None, None)`` is
        returned, and no plot is written, when either column has no values.
    """
    # Read probabilities
    prs_probs = []
    rrs_probs = []

    with open(combined_csv_path, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) >= 2:
                prs_val = row[0].strip()
                rrs_val = row[1].strip()
                # An empty cell is padding for the shorter column.
                if prs_val:
                    prs_probs.append(float(prs_val))
                if rrs_val:
                    rrs_probs.append(float(rrs_val))

    # ROC needs both classes; bail out if either is missing.
    if not prs_probs or not rrs_probs:
        return None, None, None

    # Assign labels (PRS = 1, RRS = 0)
    prs_labels = [1] * len(prs_probs)
    rrs_labels = [0] * len(rrs_probs)

    probs = np.array(prs_probs + rrs_probs)
    labels = np.array(prs_labels + rrs_labels)

    # Compute ROC curve and AUC (AUC uses the full, unfiltered curve)
    fpr, tpr, thresholds = roc_curve(labels, probs)
    roc_auc = auc(fpr, tpr)

    # Filter valid thresholds: drop non-finite entries first ...
    finite_idxs = np.where(np.isfinite(thresholds))[0]
    fpr = fpr[finite_idxs]
    tpr = tpr[finite_idxs]
    thresholds = thresholds[finite_idxs]

    # ... then keep only thresholds inside the probability range [0, 1].
    valid_thresholds_idxs = np.where((thresholds >= 0) & (thresholds <= 1))[0]
    fpr = fpr[valid_thresholds_idxs]
    tpr = tpr[valid_thresholds_idxs]
    thresholds = thresholds[valid_thresholds_idxs]

    # Compute best F1 score by trying every remaining threshold; the strict
    # '>' keeps the first threshold that reaches the maximum.
    # NOTE(review): if no threshold survives the filters, best_thresh stays
    # None and thresholds.min() below would fail - confirm inputs are in [0, 1].
    best_f1 = -1.0
    best_thresh = None
    for thresh in thresholds:
        predicted_labels = (probs >= thresh).astype(int)
        current_f1 = f1_score(labels, predicted_labels)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_thresh = thresh

    # Generate ROC plot using figure and axes approach (like original roc.py)
    fig, ax = plt.subplots(figsize=(10, 8))
    plt.rcParams['font.family'] = 'DejaVu Sans'  # More portable than Arial

    # Color-coded by threshold
    norm = plt.Normalize(vmin=thresholds.min(), vmax=thresholds.max())
    cmap = plt.cm.viridis

    # Draw the curve one segment at a time so each segment can take the
    # colour of the threshold at its starting point.
    for i in range(len(fpr) - 1):
        x = fpr[i:i + 2]
        y = tpr[i:i + 2]
        z = thresholds[i]
        ax.plot(x, y, color=cmap(norm(z)), lw=2.5)

    # Diagonal reference line (chance level)
    ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')

    # Colour bar mapping segment colours back to threshold values
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cbar = fig.colorbar(sm, ax=ax)
    cbar.set_label('Threshold', fontsize=14)

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=14)
    ax.set_ylabel('True Positive Rate', fontsize=14)
    ax.set_title('ROC Curve', fontsize=16)

    legend_text = f'AUC = {roc_auc:.3f}, Best F1 = {best_f1:.3f}, Threshold = {best_thresh:.3f}'
    ax.legend([legend_text], loc="lower right", fontsize=11)
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)

    plt.tight_layout()
    plt.savefig(output_plot_path, dpi=150, format='png')
    plt.close(fig)  # release the figure so repeated calls do not accumulate

    return roc_auc, best_f1, best_thresh
|
| 307 |
+
|
| 308 |
+
def compute_les(iterations, values):
    """Compute the Learning Efficiency Score: area under metric-vs-iteration.

    Iterations are rescaled linearly to ``[0, 1]`` before integrating, so the
    score is comparable between runs of different length. Non-finite
    iterations (the final ``ckpt.pt`` is represented as ``inf``) are dropped
    together with their metric values.

    Args:
        iterations: Training iterations, one per checkpoint.
        values: Metric values aligned with ``iterations``.

    Returns:
        The trapezoidal-rule area under the curve, or ``0.0`` when fewer than
        two finite points remain or when all finite iterations are identical
        (the curve then has no width to integrate over).
    """
    if len(iterations) < 2 or len(values) < 2:
        return 0.0

    iters = np.array(iterations, dtype=float)
    vals = np.array(values, dtype=float)

    # Remove any inf iterations (final checkpoint)
    finite = np.isfinite(iters)
    iters = iters[finite]
    vals = vals[finite]

    if len(iters) < 2:
        return 0.0

    start = iters.min()
    span = iters.max() - start
    if span == 0:
        # Duplicate iteration numbers only: normalising would divide by zero
        # and yield NaN, so report an empty area instead.
        return 0.0

    iters_normalized = (iters - start) / span

    # np.trapezoid was introduced in NumPy 2.0; older releases only provide
    # the same routine under the name np.trapz.
    integrate = getattr(np, 'trapezoid', None)
    if integrate is None:
        integrate = np.trapz

    return integrate(vals, iters_normalized)
|
| 333 |
+
|
| 334 |
+
def plot_metric_trajectory(iterations, values, metric_name, output_path, les_value):
    """Plot one metric against training iteration and save the figure.

    Points whose iteration is ``inf`` (the un-numbered final checkpoint) are
    left out. The curve is drawn with markers, the area beneath it is shaded,
    and roughly every tenth point is annotated with its value.

    Args:
        iterations: Training iterations, one per checkpoint.
        values: Metric values aligned with ``iterations``.
        metric_name: Label used for the y-axis and the title.
        output_path: File the figure is saved to.
        les_value: LES for this metric, shown in the title.
    """
    plt.figure(figsize=(10, 6))

    # Keep only points that have a finite position on the x-axis.
    plot_iters = [it for it in iterations if it < float('inf')]
    plot_vals = [val for val, it in zip(values, iterations) if it < float('inf')]

    plt.plot(plot_iters, plot_vals, 'bo-', linewidth=2, markersize=8)
    plt.fill_between(plot_iters, plot_vals, alpha=0.3)

    plt.xlabel('Training Iteration', fontsize=14)
    plt.ylabel(metric_name, fontsize=14)
    plt.title(f'{metric_name} vs Training Iteration\nLES-{metric_name} = {les_value:.4f}', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.7)

    # Label every step-th point so dense trajectories stay readable.
    step = max(1, len(plot_iters) // 10)
    for it, val in list(zip(plot_iters, plot_vals))[::step]:
        plt.annotate(f'{val:.3f}', (it, val), textcoords="offset points",
                     xytext=(0, 10), ha='center', fontsize=9)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150)
    plt.close()
|
| 360 |
+
|
| 361 |
+
def plot_combined_trajectories(iterations, auc_vals, f1_vals, thresh_vals, output_path, les_values):
    """Draw the AUC, best-F1 and threshold trajectories side by side in one figure.

    Args:
        iterations: Checkpoint iteration numbers. Entries that are not below
            ``float('inf')`` are left out of every panel.
        auc_vals: AUC values, parallel to ``iterations``.
        f1_vals: Best-F1 values, parallel to ``iterations``.
        thresh_vals: Best-F1 threshold values, parallel to ``iterations``.
        output_path: Destination handed to ``plt.savefig``.
        les_values: Mapping with keys ``"AUC"``, ``"F1"`` and ``"Threshold"``;
            each value is shown in the matching panel title.
    """
    _, axes = plt.subplots(1, 3, figsize=(15, 5))

    # One shared mask so all series drop the same (non-finite) checkpoints.
    keep = [step < float('inf') for step in iterations]

    def _finite(series):
        return [item for item, ok in zip(series, keep) if ok]

    plot_iters = _finite(iterations)

    # (series, line format, extra fill kwargs, y-label, title heading, LES key)
    # The AUC panel deliberately passes no fill colour, matching the default.
    panels = (
        (auc_vals, 'bo-', {}, 'AUC', 'AUC', 'AUC'),
        (f1_vals, 'go-', {'color': 'green'}, 'Best F1', 'Best F1', 'F1'),
        (thresh_vals, 'ro-', {'color': 'red'}, 'Best F1 Threshold', 'Threshold', 'Threshold'),
    )

    for ax, (series, line_fmt, fill_style, y_label, heading, les_key) in zip(axes, panels):
        ys = _finite(series)
        ax.plot(plot_iters, ys, line_fmt, linewidth=2, markersize=6)
        ax.fill_between(plot_iters, ys, alpha=0.3, **fill_style)
        ax.set_xlabel('Training Iteration')
        ax.set_ylabel(y_label)
        ax.set_title(f'{heading} Trajectory\nLES-{les_key} = {les_values[les_key]:.4f}')
        ax.grid(True, linestyle='--', alpha=0.7)
        ax.set_ylim([0, 1.05])

    plt.tight_layout()
    plt.savefig(output_path, dpi=150)
    plt.close()
|
| 402 |
+
|
| 403 |
+
# -----------------------------------------------------------------------------
|
| 404 |
+
# Main execution
|
| 405 |
+
# -----------------------------------------------------------------------------
|
| 406 |
+
def _abort_if_missing(path, label):
    """Print an error naming *label* and exit with status 1 if *path* does not exist."""
    if not os.path.exists(path):
        print(f"ERROR: {label} not found: {path}")
        sys.exit(1)


def _collect_sample_extra_args(args, extra_args):
    """Build the list of extra CLI arguments forwarded to the sample script.

    HOPE-specific options are only forwarded when ``args.vanilla`` is false;
    ``extra_args`` (unparsed pass-through arguments) are always appended last.
    """
    forwarded = []
    if not args.vanilla:
        # HOPE-specific arguments
        if args.use_titan_in_forward >= 0:
            forwarded.extend(['--use_titan_in_forward', str(args.use_titan_in_forward)])
        if args.enable_surprise_updates:
            forwarded.extend(['--enable_surprise_updates', str(args.enable_surprise_updates)])
        if args.surprise_update_in_eval:
            forwarded.extend(['--surprise_update_in_eval', str(args.surprise_update_in_eval)])
        if args.adapt_mode != 'none':
            forwarded.extend(['--adapt_mode', args.adapt_mode])
        if args.adapt_steps > 0:
            forwarded.extend(['--adapt_steps', str(args.adapt_steps)])
        if args.memory_state_in:
            forwarded.extend(['--memory_state_in', args.memory_state_in])

        # Teaching file arguments
        # NOTE(review): assumed to belong to the HOPE-only branch, with
        # --teach_max_rows only meaningful when a teach file is given -- confirm.
        if args.teach_file:
            forwarded.extend(['--teach_file', args.teach_file])
            forwarded.extend(['--teach_delim', args.teach_delim])
            forwarded.extend(['--teach_has_header', str(args.teach_has_header)])
            forwarded.extend(['--teach_reset_policy', args.teach_reset_policy])
            forwarded.extend(['--teach_shuffle', str(args.teach_shuffle)])
            if args.teach_max_rows > 0:
                forwarded.extend(['--teach_max_rows', str(args.teach_max_rows)])

    # Add any extra arguments passed through
    forwarded.extend(extra_args)
    return forwarded


def _evaluate_checkpoint(args, sample_script, sample_extra_args, ckpt_name, iteration, iter_str):
    """Run inference and ROC analysis for one checkpoint.

    Returns the result row (a dict) on success, or ``None`` when any stage
    fails; a message explaining the skip is printed in that case.
    """
    # Create subdirectory for this checkpoint
    ckpt_subdir = os.path.join(args.output_dir, f"ckpt_{iter_str}")
    os.makedirs(ckpt_subdir, exist_ok=True)

    prs_prefix = f"PRS_iter{iter_str}"
    rrs_prefix = f"RRS_iter{iter_str}"

    prs_csv = os.path.join(ckpt_subdir, f"{prs_prefix}_probabilities.csv")
    rrs_csv = os.path.join(ckpt_subdir, f"{rrs_prefix}_probabilities.csv")

    if not args.skip_inference:
        # PRS first, then RRS; the first failure abandons the checkpoint.
        inference_jobs = (
            ('PRS', args.prs_file, prs_prefix),
            ('RRS', args.rrs_file, rrs_prefix),
        )
        for set_name, input_file, prefix in inference_jobs:
            print(f" Running {set_name} inference...")
            if not run_inference(sample_script, args.checkpoint_dir, ckpt_name,
                                 input_file, ckpt_subdir, prefix, sample_extra_args,
                                 vanilla=args.vanilla):
                print(" SKIPPING checkpoint due to inference error")
                return None

    # Extract probabilities
    print(" Extracting probabilities...")
    prs_probs = extract_probabilities_from_csv(prs_csv)
    rrs_probs = extract_probabilities_from_csv(rrs_csv)

    if not prs_probs or not rrs_probs:
        print(" WARNING: Could not extract probabilities, skipping")
        return None

    print(f" PRS samples: {len(prs_probs)}, RRS samples: {len(rrs_probs)}")

    # Combine probabilities
    combined_csv = os.path.join(ckpt_subdir, f"combined_probabilities_iter{iter_str}.csv")
    combine_probabilities(prs_probs, rrs_probs, combined_csv)

    # Run ROC analysis
    print(" Running ROC analysis...")
    roc_plot = os.path.join(ckpt_subdir, f"ROC_iter{iter_str}.png")
    roc_auc, best_f1, best_thresh = run_roc_analysis_internal(combined_csv, roc_plot)

    if roc_auc is None:
        print(" WARNING: ROC analysis failed, skipping")
        return None

    print(f" Results: AUC={roc_auc:.4f}, F1={best_f1:.4f}, Threshold={best_thresh:.4f}")

    return {
        'checkpoint': ckpt_name,
        'iteration': iteration if iteration < float('inf') else 'final',
        'AUC': roc_auc,
        'Best_F1': best_f1,
        'Best_F1_Threshold': best_thresh,
        'PRS_samples': len(prs_probs),
        'RRS_samples': len(rrs_probs),
    }


def main():
    """Evaluate every checkpoint and report Learning Efficiency Scores (LES).

    Steps: validate the input paths, locate the sample script, evaluate each
    checkpoint (inference, probability extraction, ROC analysis), compute the
    LES values, then write trajectory plots, ``summary_table.csv`` and
    ``manifest.json`` into ``args.output_dir``. Exits with status 1 when a
    required path is missing or no checkpoint matches the pattern.
    """
    args, extra_args = parse_args()

    # Validate inputs
    _abort_if_missing(args.checkpoint_dir, "Checkpoint directory")
    _abort_if_missing(args.prs_file, "PRS file")
    _abort_if_missing(args.rrs_file, "RRS file")

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Find sample script
    # Note: Both vanilla and HOPE models trained with train_hope_v3.py use the same
    # checkpoint format, so we always use the HOPE sample script. For vanilla models,
    # we just skip the HOPE-specific arguments (Titan, surprise updates, etc.)
    # NOTE(review): confirm this file name matches the sample script that is
    # actually shipped next to this wrapper.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    sample_script = os.path.join(script_dir, 'sample_fasta3.3_softmax_error_handling3e_hope_v3.py')
    model_type = "Vanilla GPT (trained with train_hope_v3.py)" if args.vanilla else "HOPE"

    _abort_if_missing(sample_script, "Sample script")

    # Get checkpoints
    checkpoints = get_checkpoints(args.checkpoint_dir, args.checkpoint_pattern, args.include_final)

    if not checkpoints:
        print(f"ERROR: No checkpoints found matching pattern '{args.checkpoint_pattern}' in {args.checkpoint_dir}")
        sys.exit(1)

    print(f"\n{'='*60}")
    print("LES-wrapper: Learning Efficiency Score Evaluation")
    print(f"{'='*60}")
    print(f"Model type: {model_type}")
    print(f"Checkpoint directory: {args.checkpoint_dir}")
    print(f"PRS file: {args.prs_file}")
    print(f"RRS file: {args.rrs_file}")
    print(f"Output directory: {args.output_dir}")
    print(f"Found {len(checkpoints)} checkpoints")
    print(f"{'='*60}\n")

    # Build extra args for sample script (only for HOPE models)
    sample_extra_args = _collect_sample_extra_args(args, extra_args)

    # Results storage
    results = []
    iterations = []
    auc_values = []
    f1_values = []
    thresh_values = []

    # Process each checkpoint
    for position, (ckpt_name, iteration, _) in enumerate(checkpoints, start=1):
        iter_str = str(iteration) if iteration < float('inf') else 'final'
        print(f"\n[{position}/{len(checkpoints)}] Processing checkpoint: {ckpt_name} (iteration {iter_str})")

        row = _evaluate_checkpoint(args, sample_script, sample_extra_args,
                                   ckpt_name, iteration, iter_str)
        if row is None:
            continue

        # Store results
        results.append(row)
        iterations.append(iteration)
        auc_values.append(row['AUC'])
        f1_values.append(row['Best_F1'])
        thresh_values.append(row['Best_F1_Threshold'])

    # Compute LES values
    print(f"\n{'='*60}")
    print("Computing Learning Efficiency Scores (LES)")
    print(f"{'='*60}")

    # compute_les returns 0.0 when fewer than two finite-iteration points
    # remain; say so, because a bare 0.0 looks like a measured score.
    finite_points = sum(1 for it in iterations if it < float('inf'))
    if finite_points < 2:
        print(" WARNING: fewer than two checkpoints with a finite iteration were "
              "evaluated; LES values default to 0.0")

    les_auc = compute_les(iterations, auc_values)
    les_f1 = compute_les(iterations, f1_values)
    les_thresh = compute_les(iterations, thresh_values)

    les_values = {
        'AUC': les_auc,
        'F1': les_f1,
        'Threshold': les_thresh
    }

    print(f" LES-AUC: {les_auc:.6f}")
    print(f" LES-F1: {les_f1:.6f}")
    print(f" LES-Threshold: {les_thresh:.6f}")

    # Generate trajectory plots
    if not args.no_plots and len(iterations) >= 2:
        print("\nGenerating trajectory plots...")

        # Individual plots
        plot_metric_trajectory(iterations, auc_values, 'AUC',
                               os.path.join(args.output_dir, 'trajectory_AUC.png'), les_auc)
        plot_metric_trajectory(iterations, f1_values, 'Best F1',
                               os.path.join(args.output_dir, 'trajectory_F1.png'), les_f1)
        plot_metric_trajectory(iterations, thresh_values, 'Best F1 Threshold',
                               os.path.join(args.output_dir, 'trajectory_Threshold.png'), les_thresh)

        # Combined plot
        plot_combined_trajectories(iterations, auc_values, f1_values, thresh_values,
                                   os.path.join(args.output_dir, 'trajectory_combined.png'), les_values)

        print(f" Saved trajectory plots to {args.output_dir}")

    # Generate summary table
    print("\nGenerating summary table...")
    summary_csv = os.path.join(args.output_dir, 'summary_table.csv')
    with open(summary_csv, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['checkpoint', 'iteration', 'AUC', 'Best_F1',
                                               'Best_F1_Threshold', 'PRS_samples', 'RRS_samples'])
        writer.writeheader()
        writer.writerows(results)

        # Add LES row (written through the same handle; the bytes on disk are
        # the same as reopening the file in append mode)
        f.write(f"\nLES (Learning Efficiency Score),---,{les_auc:.6f},{les_f1:.6f},{les_thresh:.6f},---,---\n")

    print(f" Saved summary table to {summary_csv}")

    # Generate JSON manifest
    manifest = {
        'timestamp': datetime.now().isoformat(),
        'checkpoint_dir': args.checkpoint_dir,
        'prs_file': args.prs_file,
        'rrs_file': args.rrs_file,
        'output_dir': args.output_dir,
        'num_checkpoints': len(checkpoints),
        'num_successful': len(results),
        'LES': {
            'AUC': les_auc,
            'F1': les_f1,
            'Threshold': les_thresh
        },
        'results': results
    }

    manifest_path = os.path.join(args.output_dir, 'manifest.json')
    with open(manifest_path, 'w') as f:
        # default=str stringifies any value json cannot encode natively
        json.dump(manifest, f, indent=2, default=str)

    print(f" Saved manifest to {manifest_path}")

    # Print final summary
    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")
    print(f"Checkpoints processed: {len(results)}/{len(checkpoints)}")
    print("\nLearning Efficiency Scores (LES):")
    print(f" LES-AUC: {les_auc:.6f}")
    print(f" LES-F1: {les_f1:.6f}")
    print(f" LES-Threshold: {les_thresh:.6f}")

    if results:
        # The last successfully evaluated checkpoint is reported as "final".
        final_result = results[-1]
        print("\nFinal Checkpoint Performance:")
        print(f" AUC: {final_result['AUC']:.4f}")
        print(f" Best F1: {final_result['Best_F1']:.4f}")
        print(f" Threshold: {final_result['Best_F1_Threshold']:.4f}")

    print(f"\nOutputs saved to: {args.output_dir}")
    print(f"{'='*60}\n")


if __name__ == '__main__':
    main()
|
LICENSE
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2022 Andrej Karpathy (nanoGPT -- original framework)
|
| 4 |
+
Copyright (c) 2026 Kourosh Salehi-Ashtiani (ppiGPLM -- modifications and additions)
|
| 5 |
+
|
| 6 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 7 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 8 |
+
in the Software without restriction, including without limitation the rights
|
| 9 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 10 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 11 |
+
furnished to do so, subject to the following conditions:
|
| 12 |
+
|
| 13 |
+
The above copyright notice and this permission notice shall be included in all
|
| 14 |
+
copies or substantial portions of the Software.
|
| 15 |
+
|
| 16 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 17 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 18 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 19 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 20 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 21 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 22 |
+
SOFTWARE.
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
This repository is a derivative of nanoGPT (https://github.com/karpathy/nanoGPT)
|
| 27 |
+
by Andrej Karpathy, originally released under the MIT License.
|
| 28 |
+
|
| 29 |
+
Modifications and additions by Kourosh Salehi-Ashtiani include:
|
| 30 |
+
- Repurposing GPT-2 for binary protein-protein interaction classification
|
| 31 |
+
via next-token prediction on structured protein pair prompts
|
| 32 |
+
- sample_fasta3.3_softmax_error_handling3e.py: Batch inference script
|
| 33 |
+
with softmax probability extraction for PPI classification
|
| 34 |
+
- LES-wrapper.py: Learning Efficiency Score evaluation wrapper for
|
| 35 |
+
automated multi-checkpoint ROC analysis
|
| 36 |
+
- roc_analysis_color_threshold_F1e.py: ROC curve analysis with
|
| 37 |
+
threshold-colored visualization and F1 optimization
|
| 38 |
+
- Training configuration for character-level protein pair language models
|
| 39 |
+
- Data preparation pipeline for MED4 PPI datasets
|
MED4-PPIs-low-confidence_ppiGPLM_prompts.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
MED4_100_PRS.csv
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<ps1>,MNTRKKNPKRGVGKTETNTEWLDKVINQLINKDFSQYL,<ps2>,MKLRLFEFYFIKDYLRPWFGLIYSLFFLFFLGAIGYRITEGWDWGDCLWMVLITITTIGFGEVQTLSPEGRIITVLIIVGGLIFIQFTFQKAVRLFESGYFQRVNELRFKRLLRKMENHVILCGYGRVGQEISNQIKTQNIPIIVVESDEDRKKIAEDNGLEVLCADATLDETLKLAGLDKCKSLVVTLPNDAANLYVVLSAKGIRSSIRVIARAGTEEAASKLRLAGASIVVSPYIAAGRAMASMALRPIAIDFLDLLAGSECEIEEFELSNDISLFETAEKITLLELGIGKKSGAKILAIKEDEKLITNPGGDFLLQPGQVLIAFGSKEQLTTLNRLLGNLVVSVELLK,<
|
| 2 |
+
<ps1>,LVESTQSQDSNLGTRLQQDLKNDLIAGLLVVIPLATTIWLSSIVSKFVLTLVTSVPKQLNPFITLNPLLQDLINLTLGLTVPLLAILLIGLMARNFVGRWLLEFGEGTLSKIPVAGAVYKTLKQLLETFLSNKSNRFRRVVLVEYPREGLFSVGFVTGDVGPSLQSELDEKLLSVFIPTAPNPTTGWYTLVPESSVKDLAISVEDAFRTIISVGIVNPDEKDSSSNPTFSKLFSQLRASTNTSST,<ps2>,LVESTQSQDSNLGTRLQQDLKNDLIAGLLVVIPLATTIWLSSIVSKFVLTLVTSVPKQLNPFITLNPLLQDLINLTLGLTVPLLAILLIGLMARNFVGRWLLEFGEGTLSKIPVAGAVYKTLKQLLETFLSNKSNRFRRVVLVEYPREGLFSVGFVTGDVGPSLQSELDEKLLSVFIPTAPNPTTGWYTLVPESSVKDLAISVEDAFRTIISVGIVNPDEKDSSSNPTFSKLFSQLRASTNTSST,<
|
| 3 |
+
<ps1>,LMHNRSLSRELSLLSLGLIKDTADLELNKIQIDEIFESALDSLINHCRDELDNCEADLENVSQHILDSELKEGSNSSFANVRDELKKAFYKMESVMNSLSVTLDFPKLIVSSNQIDIREDVNHRILSIINNLKSIDSEIDEVMDRWRLKRLPRVDRDILRLAYVDIHFLDTPVAVACDEAVNLANKYCDTQGRKMINGVLRRLQRVKVN,<ps2>,LMHNRSLSRELSLLSLGLIKDTADLELNKIQIDEIFESALDSLINHCRDELDNCEADLENVSQHILDSELKEGSNSSFANVRDELKKAFYKMESVMNSLSVTLDFPKLIVSSNQIDIREDVNHRILSIINNLKSIDSEIDEVMDRWRLKRLPRVDRDILRLAYVDIHFLDTPVAVACDEAVNLANKYCDTQGRKMINGVLRRLQRVKVN,<
|
| 4 |
+
<ps1>,MNQFFSRRSFILIPIMSILKFILQPKKVLAAFAASDDDWNLSKEDWKNKLSPESYYILREEGTERAFSSQLNNEKRKGIFYCAGCNQPLFTSDTKFDSGTGWPSFWDPIQGSVETKVDFKLIVPRTEYHCSRCGGHQGHVFNDGPLPTGKRYCNNGLALKFIAE,<ps2>,MNQFFSRRSFILIPIMSILKFILQPKKVLAAFAASDDDWNLSKEDWKNKLSPESYYILREEGTERAFSSQLNNEKRKGIFYCAGCNQPLFTSDTKFDSGTGWPSFWDPIQGSVETKVDFKLIVPRTEYHCSRCGGHQGHVFNDGPLPTGKRYCNNGLALKFIAE,<
|
| 5 |
+
<ps1>,LVKPKSPDNKISNHLQQDVVKIAGKTIFINPFLYWRRFDENTNRWLREPGQMSEEQIQPNRNRFYPEIDWADLSQNQKLVKDASVEMFLKTLELISTFHPQLNSGQLLEVERKMAITKKLPFEKWVTKSFAKKARAEEYEKRKFKRDRFIRSWKEWLSLENTQQALLPIIVVVFVSAFIGWSSGVSKNSCNPYFEQNLDQSI,<ps2>,MKGHKKIRFIFPLVAMYVPLLLLAPKAIAGSFGAEIFCTMRDGGNDHESSWQAAYSYIKKQKGGIFKTSPKQAAGQIIETVVRERDKFSYCVEFLDQLHPDRKLQLENDRKEKRRKKEELLQDKENEDYSKETFDRYSY,<
|
| 6 |
+
<ps1>,MDKPKNKNFANTASRISAIASSVMDLHVRIALQEVDREKRRLISGGVFIAMGGILLLLVLISIHVIFYLTLSKLNNWATEYNLLLIIFVDLFLAGLSLKLGGKLAKGPYLPQTLEGLGKTTKAVLGKK,<ps2>,MDKPKNKNFANTASRISAIASSVMDLHVRIALQEVDREKRRLISGGVFIAMGGILLLLVLISIHVIFYLTLSKLNNWATEYNLLLIIFVDLFLAGLSLKLGGKLAKGPYLPQTLEGLGKTTKAVLGKK,<
|
| 7 |
+
<ps1>,MQSKTKELDPILEVNNLFASIENLPILKGVTISVNPGEIHAIMGRNGCGKSTLSKIIAGHPSYKITKGEIKFTGNDIQSLEPEERAQSGIFLGFQYPIEIPGVSNLEFLRVATNARRKFLNKEELDTFDFEDLVKEKLDLVKMDSAFLSRSINQGFSGGEKKRNEILQMALLEPKIAILDETDSGLDIDALRIVASGIKKISNEETGIILITHYQRLLDEIQPDYVHVMSDGQIIKTGESDLALELEKHGYEWTDNFIKEQ,<ps2>,MQSKTKELDPILEVNNLFASIENLPILKGVTISVNPGEIHAIMGRNGCGKSTLSKIIAGHPSYKITKGEIKFTGNDIQSLEPEERAQSGIFLGFQYPIEIPGVSNLEFLRVATNARRKFLNKEELDTFDFEDLVKEKLDLVKMDSAFLSRSINQGFSGGEKKRNEILQMALLEPKIAILDETDSGLDIDALRIVASGIKKISNEETGIILITHYQRLLDEIQPDYVHVMSDGQIIKTGESDLALELEKHGYEWTDNFIKEQ,<
|
| 8 |
+
<ps1>,MSKVEIYTWRFCPFCIRAKSLLEKKNITFTEHKIDGDDNARELMMERANGKRTVPQIFIDDKSIGGCDELYELEKEDKLDLLLN,<ps2>,MSKVEIYTWRFCPFCIRAKSLLEKKNITFTEHKIDGDDNARELMMERANGKRTVPQIFIDDKSIGGCDELYELEKEDKLDLLLN,<
|
| 9 |
+
<ps1>,VQFIDQANIILKAGKGGNGIVSFRREKFVPAGGPSGGNGGKGGSIIIIADNNLQTLLDFKFNREIFAKDGFKGGPNKRSGASGENTILKVPCGTEIRDFNTGIILGDLTEDKQSLTIAHGGRGGHGNAYYLSNQNRAPESFTEGKEGEIWEVQLELKLLAEVGIIGLPNAGKSTLISVLSSARPKIANYPFTTLIPNLGVVRKADGNGCLFADIPGLISGAAEGVGLGHDFLRHIQRTKILIHLIDSIAENPIRDFEIIEKELKRYGSGLLNKERIVVLNKMELVDENYLQTITKKLENLSKKKVLVISSSLRKGLSPLLSEVWKRI,<ps2>,VPNNQNRDNFIDKAFTVIAESIVKIMPIADKEKKAYIYYRDGLAAQNNGDYSEALDYYNESLLLEENKIDRGETLKNMAIIYMSNGEEDRSIETYQKALEENPKQPSCLKNIGLIYEKRGRFAEQNGDLDQRDMWFDKAAQVWSKAVRLYPGGYLDIENWLKTSGRSSIDIYL,<
|
| 10 |
+
<ps1>,MTTIQQQRTSLLKGWPQFCEWVTSTNNRIYVGWFGVLMIPCLLAAAACFIVAFIAAPPVDIDGIREPVAGSFLYGNNIISGAVVPSSNAIGLHFYPIWEAATVDEWLYNGGPYQLVIFHFLIGISAYMGRQWELSYRLGMRPWICVAYSAPVSAAFAVFLVYPFGQGSFSDGMPLGISGTFNFMFVFQAEHNILMHPFHMAGVAGMFGGSLFSAMHGSLVTSSLIRETTETESQNYGYKFGQEEETYNIVAAHGYFGRLIFQYASFNNSRSLHFFLAVFPVVCVWLTSMGICTMAFNLNGFNFNQSVVDANGKIVPTWGDVLNRANLGMEVMHERNAHNFPLDLAAAESTTVALTAPAIG,<ps2>,MTTIQQQRTSLLKGWPQFCEWVTSTNNRIYVGWFGVLMIPCLLAAAACFIVAFIAAPPVDIDGIREPVAGSFLYGNNIISGAVVPSSNAIGLHFYPIWEAATVDEWLYNGGPYQLVIFHFLIGISAYMGRQWELSYRLGMRPWICVAYSAPVSAAFAVFLVYPFGQGSFSDGMPLGISGTFNFMFVFQAEHNILMHPFHMAGVAGMFGGSLFSAMHGSLVTSSLIRETTETESQNYGYKFGQEEETYNIVAAHGYFGRLIFQYASFNNSRSLHFFLAVFPVVCVWLTSMGICTMAFNLNGFNFNQSVVDANGKIVPTWGDVLNRANLGMEVMHERNAHNFPLDLAAAESTTVALTAPAIG,<
|
| 11 |
+
<ps1>,MSKVELISLTPEAEKTMAYIARVSNPSNQANDKFAGLLRYCIKHEHWSVFEQSCMTLKIETNRGIAAQILRHRSFTFQEFSQRYAETSLLGNEIPIPNLRRQDQKNRQNSIDDIPDELKIKFSEKISKHFQEANKLYEEMLNEGIAKECARFIMPLATPTRIYMTGSCRSWIHYIQLRSKEGTQKEHMEIAEDCKKVFIKYFPSVSEALNWE,<ps2>,MSKVELISLTPEAEKTMAYIARVSNPSNQANDKFAGLLRYCIKHEHWSVFEQSCMTLKIETNRGIAAQILRHRSFTFQEFSQRYAETSLLGNEIPIPNLRRQDQKNRQNSIDDIPDELKIKFSEKISKHFQEANKLYEEMLNEGIAKECARFIMPLATPTRIYMTGSCRSWIHYIQLRSKEGTQKEHMEIAEDCKKVFIKYFPSVSEALNWE,<
|
| 12 |
+
<ps1>,VSGWLFIIFLLLLGGLISTLGDLLGSKIGKARFSILKLRPKKTAILITILTGSLISASSLFLMILVNRQLRVGLFRLGDLQKKLQESKQVLIPLEKEREKLENKIKAKETEFKQLERNIIALRSGKFVIRSGQSLIISEISSSNLKDIKSKIEKIIINANRYTHKIVKPKNKEVKNLLLLRKNHIEEMQNIILKGGNWVINIKSVRNVLTGENFVYAFPEITENKIIVRKGEKITKIDFKQEDFNKKDFGDKVNFLLSSSLAEIKRRGSLVNEIKLRGDSIKELRDFLNKNDKTNFELEAVSLFNSKTAQPVIVELNVNYPES,<ps2>,VSGWLFIIFLLLLGGLISTLGDLLGSKIGKARFSILKLRPKKTAILITILTGSLISASSLFLMILVNRQLRVGLFRLGDLQKKLQESKQVLIPLEKEREKLENKIKAKETEFKQLERNIIALRSGKFVIRSGQSLIISEISSSNLKDIKSKIEKIIINANRYTHKIVKPKNKEVKNLLLLRKNHIEEMQNIILKGGNWVINIKSVRNVLTGENFVYAFPEITENKIIVRKGEKITKIDFKQEDFNKKDFGDKVNFLLSSSLAEIKRRGSLVNEIKLRGDSIKELRDFLNKNDKTNFELEAVSLFNSKTAQPVIVELNVNYPES,<
|
| 13 |
+
<ps1>,MNDRIIEFEPLIEGILIKRYKRFLADIQIENGEIVTAHCANTGPMKGLLNEGANVRISFSSSTKRKLPWTWEQVKVIGRDNKEVWVGINTLFANKLIRKVIEQNLFKDKLGEIAKIKSEVPYGKDKKSRIDFLLTPKSSNPDNRNIYVEVKNTTWTKNNVALFPDTETKRGQKHLIELKGLIPESKSVLVPCITRKDIDYFAPGDESDPLYGELFRESISAGMLLIPCCFEFHSDHVAWKGFKPLKLN,<ps2>,MNDRIIEFEPLIEGILIKRYKRFLADIQIENGEIVTAHCANTGPMKGLLNEGANVRISFSSSTKRKLPWTWEQVKVIGRDNKEVWVGINTLFANKLIRKVIEQNLFKDKLGEIAKIKSEVPYGKDKKSRIDFLLTPKSSNPDNRNIYVEVKNTTWTKNNVALFPDTETKRGQKHLIELKGLIPESKSVLVPCITRKDIDYFAPGDESDPLYGELFRESISAGMLLIPCCFEFHSDHVAWKGFKPLKLN,<
|
| 14 |
+
<ps1>,MRRSLRDSIVGFSLLGGLLVFTFFSFWLRGVKLSSKNWYLFAEFNNASGLSKKSPVTYRGILVGSIEDILFTNESIKAKIVLNNPEIILPKPAFARVVTNSFLGGDVQVALETSEKTIPKNTPKAISDKCDSKLIICQGDTITGKQLSSLSNITNRINQLLKESNQENLIENVVNSIDQFDKTQENLDELIYLSKQEIIRVKPLIKEVTIAAGHLNNILSTINDEETLKDIKLTIEAAESISGKFDNMSDDFEQLMKDKELTKSIRDLTIGLSKFLNEIYP,<ps2>,MRRSLRDSIVGFSLLGGLLVFTFFSFWLRGVKLSSKNWYLFAEFNNASGLSKKSPVTYRGILVGSIEDILFTNESIKAKIVLNNPEIILPKPAFARVVTNSFLGGDVQVALETSEKTIPKNTPKAISDKCDSKLIICQGDTITGKQLSSLSNITNRINQLLKESNQENLIENVVNSIDQFDKTQENLDELIYLSKQEIIRVKPLIKEVTIAAGHLNNILSTINDEETLKDIKLTIEAAESISGKFDNMSDDFEQLMKDKELTKSIRDLTIGLSKFLNEIYP,<
|
| 15 |
+
<ps1>,MASSETYEFLFVKPGDHVVIKNEKPPGNTQNGRQEYWIGQIISCIGGARNPNSWTLFQVADIDNGEIIIINADTVERILKTAEN,<ps2>,MASSETYEFLFVKPGDHVVIKNEKPPGNTQNGRQEYWIGQIISCIGGARNPNSWTLFQVADIDNGEIIIINADTVERILKTAEN,<
|
| 16 |
+
<ps1>,LSVDRELLKEVTQELWNTVKKLRPEIDRETRLQLVLKALLTIGDLPDQLQAAMVVGVCAEMDKSDFENADGNSNTKEESNSTSVDTSTGRKVFRRSSAK,<ps2>,LSVDRELLKEVTQELWNTVKKLRPEIDRETRLQLVLKALLTIGDLPDQLQAAMVVGVCAEMDKSDFENADGNSNTKEESNSTSVDTSTGRKVFRRSSAK,<
|
| 17 |
+
<ps1>,LQIGDKVPQFSLLDQNGTKRSNNGLKTPLVLFFYPKDDTPGCTIEVCGFRDKYDLFKVLGAQVWGVSNGSSSSHLAFANKNKLQYPLLCDKNDSLRKAFKVPKVLGLLDGRVTYVIDRNGFVKHIFRDLLNGPEHIKEAIRVLKEIQNQ,<ps2>,LQIGDKVPQFSLLDQNGTKRSNNGLKTPLVLFFYPKDDTPGCTIEVCGFRDKYDLFKVLGAQVWGVSNGSSSSHLAFANKNKLQYPLLCDKNDSLRKAFKVPKVLGLLDGRVTYVIDRNGFVKHIFRDLLNGPEHIKEAIRVLKEIQNQ,<
|
| 18 |
+
<ps1>,MFSINKSNFMKKIGMQAVDEAIENGIDLDGTPIPSKMLELYNRIMSEENKRERSGVKKSMRNRCVKTGSKHFDKETLDQLLIDSGWEGLKEKEILFFYS,<ps2>,MFSINKSNFMKKIGMQAVDEAIENGIDLDGTPIPSKMLELYNRIMSEENKRERSGVKKSMRNRCVKTGSKHFDKETLDQLLIDSGWEGLKEKEILFFYS,<
|
| 19 |
+
<ps1>,VNSNSSNQVGKNIRRTGFLIVLSYLLIVLIMKVLEANNFFGYSLSSFSNDIFAPPSLKHLCGTDRLGRDVCLRTLQGSSIAIEVVFLAIFFALILGLPLGLLSGYFGGILDKCLSLVMDTIFSIPVILLAVVVAFVLGKGIINASIALCIVYSPQYFRLIRNQTMLIKSETYVEAARVSGADVKTIIFKYILPNVITPLPILLTLNAADAVLVLGSLGFLGLGVPANVPEWGSDLNLALAAIPTGIWWTALFPGLAMFFLVLGLSFIGEELENIFEN,<ps2>,LEGINLNQIGVSFKGSGSYVPNQILTNQEISKKVETSDEWIKSRTGISQRRISGLSENVSEMGYKAALGAIEMARWDIETIDLIILATSTPNDLFGSAPEIQSKLGAINAVAFDLTAACSGFLFAAITATQFLKAGSYKRAVVIGSDQLSSYVDWNDRRSCILFGDGAGAIAIEGTNELDNLLGFSMRTDGQRGSFLNLPSQNNQDLIINDINFSSGGFSSIKMNGQEVYKFAVREVPLIIDNLFKKTNFNSEKINWLLLHQANQRILDSVGERLNVSTEKILSNLSNYGNTSAATIPLMLDEAIRNKKIKENDIIATSGFGAGLSWGAALIRWG,<
|
| 20 |
+
<ps1>,MSNTESLTGKVALITGASRGIGKEIALELSNLGAKVIINYSSSDEKAEEVVNLIKESGGKVHKLKFDVSKEESVSKAFEEIIKINGAIDILVNNAGITRDGLLMRMKSEQWDDVLNTNLKGVFLCTKYASKFMIKKRSGKIINISSIVGIIGNPGQANYSAAKAGVIGFTKTCAKEFASRGINVNAIAPGFIETEMTEKLNNEEIIKAIPLGKLGSCSQIANLVSFLVSSNAGSYITGQTISIDGGMSI,<ps2>,MSNTESLTGKVALITGASRGIGKEIALELSNLGAKVIINYSSSDEKAEEVVNLIKESGGKVHKLKFDVSKEESVSKAFEEIIKINGAIDILVNNAGITRDGLLMRMKSEQWDDVLNTNLKGVFLCTKYASKFMIKKRSGKIINISSIVGIIGNPGQANYSAAKAGVIGFTKTCAKEFASRGINVNAIAPGFIETEMTEKLNNEEIIKAIPLGKLGSCSQIANLVSFLVSSNAGSYITGQTISIDGGMSI,<
|
| 21 |
+
<ps1>,MHKVLAIETSCDETSVSIVSNSGDIYKIHSNIVASQIEDHSKWGGVVPELAARKHLELLPFVLEQALEESKIRIEKIDVIASTVTPGLVGCLRVGSITARSLCTLYSKPFLGIHHLEGHLSSILFSKNYPKPPFLTLLVSGGHTELIKVGERRKMQRLGRSYDDAAGEAFDKVGRLLGLSYPGGPAIAKIAKKGNASKFNLPKCKISDKEGGFLKYDFSFSGLKTAVLRLVEKINLNGDEIPIPDIAASFERVVAEVLVERTIKCANDYGLDNIVVVGGVAANDTLRKMMISEACKKSIKVHLAPINLCTDNAAMIGAAALYRLKFKAYESSLKLGISGRLPIDQANTLYENKPPF,<ps2>,LVKPKSPDNKISNHLQQDVVKIAGKTIFINPFLYWRRFDENTNRWLREPGQMSEEQIQPNRNRFYPEIDWADLSQNQKLVKDASVEMFLKTLELISTFHPQLNSGQLLEVERKMAITKKLPFEKWVTKSFAKKARAEEYEKRKFKRDRFIRSWKEWLSLENTQQALLPIIVVVFVSAFIGWSSGVSKNSCNPYFEQNLDQSI,<
|
| 22 |
+
<ps1>,MHKVLAIETSCDETSVSIVSNSGDIYKIHSNIVASQIEDHSKWGGVVPELAARKHLELLPFVLEQALEESKIRIEKIDVIASTVTPGLVGCLRVGSITARSLCTLYSKPFLGIHHLEGHLSSILFSKNYPKPPFLTLLVSGGHTELIKVGERRKMQRLGRSYDDAAGEAFDKVGRLLGLSYPGGPAIAKIAKKGNASKFNLPKCKISDKEGGFLKYDFSFSGLKTAVLRLVEKINLNGDEIPIPDIAASFERVVAEVLVERTIKCANDYGLDNIVVVGGVAANDTLRKMMISEACKKSIKVHLAPINLCTDNAAMIGAAALYRLKFKAYESSLKLGISGRLPIDQANTLYENKPPF,<ps2>,VNSNSSNQVGKNIRRTGFLIVLSYLLIVLIMKVLEANNFFGYSLSSFSNDIFAPPSLKHLCGTDRLGRDVCLRTLQGSSIAIEVVFLAIFFALILGLPLGLLSGYFGGILDKCLSLVMDTIFSIPVILLAVVVAFVLGKGIINASIALCIVYSPQYFRLIRNQTMLIKSETYVEAARVSGADVKTIIFKYILPNVITPLPILLTLNAADAVLVLGSLGFLGLGVPANVPEWGSDLNLALAAIPTGIWWTALFPGLAMFFLVLGLSFIGEELENIFEN,<
|
| 23 |
+
<ps1>,MHKVLAIETSCDETSVSIVSNSGDIYKIHSNIVASQIEDHSKWGGVVPELAARKHLELLPFVLEQALEESKIRIEKIDVIASTVTPGLVGCLRVGSITARSLCTLYSKPFLGIHHLEGHLSSILFSKNYPKPPFLTLLVSGGHTELIKVGERRKMQRLGRSYDDAAGEAFDKVGRLLGLSYPGGPAIAKIAKKGNASKFNLPKCKISDKEGGFLKYDFSFSGLKTAVLRLVEKINLNGDEIPIPDIAASFERVVAEVLVERTIKCANDYGLDNIVVVGGVAANDTLRKMMISEACKKSIKVHLAPINLCTDNAAMIGAAALYRLKFKAYESSLKLGISGRLPIDQANTLYENKPPF,<ps2>,MHKVLAIETSCDETSVSIVSNSGDIYKIHSNIVASQIEDHSKWGGVVPELAARKHLELLPFVLEQALEESKIRIEKIDVIASTVTPGLVGCLRVGSITARSLCTLYSKPFLGIHHLEGHLSSILFSKNYPKPPFLTLLVSGGHTELIKVGERRKMQRLGRSYDDAAGEAFDKVGRLLGLSYPGGPAIAKIAKKGNASKFNLPKCKISDKEGGFLKYDFSFSGLKTAVLRLVEKINLNGDEIPIPDIAASFERVVAEVLVERTIKCANDYGLDNIVVVGGVAANDTLRKMMISEACKKSIKVHLAPINLCTDNAAMIGAAALYRLKFKAYESSLKLGISGRLPIDQANTLYENKPPF,<
|
| 24 |
+
<ps1>,MIKNPIQEVTNKLQYRAIGIVKGIYKPNNIDQLNRGTLTDKEGKIIETVILGKAIALIKKYINLEKDYFWIVYPRNKNINNLHLQVAGIWDPYQLNQFDKNNSEKDPNQLLEELNLNNNYFSIRGELVYVNIKKKEIVIKICSSPPSKRSKYLTFKIIIEGEIPLQFLNNFVSLDVIRDGNTLRMAKYEIIEKIKPEKV,<ps2>,MIKNPIQEVTNKLQYRAIGIVKGIYKPNNIDQLNRGTLTDKEGKIIETVILGKAIALIKKYINLEKDYFWIVYPRNKNINNLHLQVAGIWDPYQLNQFDKNNSEKDPNQLLEELNLNNNYFSIRGELVYVNIKKKEIVIKICSSPPSKRSKYLTFKIIIEGEIPLQFLNNFVSLDVIRDGNTLRMAKYEIIEKIKPEKV,<
|
| 25 |
+
<ps1>,MTKFKLKIASRRSKLAMVQTLWVKEQLEKNIPDLEVSIEAMATQGDKILDVALAKIGDKGLFTKELEAQMLVGHADIAVHSLKDLPTNLPDGLTLGCITKREDPSDALVVNKKNKIYQLESLPPGSIVGTSSLRRLAQLRYKFPHLDFKDIRGNVITRIEKLDSGEFDCIILAAAGLKRLGFESRVHQIIPNEISLHAVGQGALGIECKSDDKEVLKIISVLEDKVSSQRCLAERSFLRELEGGCQVPIGVNSSIQNDEIALIGMVASIDGKRLIKNESIGNIKYPEEVGKKLAEKLKLQGADKILSEIFEQFRDK,<ps2>,VPNNQNRDNFIDKAFTVIAESIVKIMPIADKEKKAYIYYRDGLAAQNNGDYSEALDYYNESLLLEENKIDRGETLKNMAIIYMSNGEEDRSIETYQKALEENPKQPSCLKNIGLIYEKRGRFAEQNGDLDQRDMWFDKAAQVWSKAVRLYPGGYLDIENWLKTSGRSSIDIYL,<
|
| 26 |
+
<ps1>,MTKFKLKIASRRSKLAMVQTLWVKEQLEKNIPDLEVSIEAMATQGDKILDVALAKIGDKGLFTKELEAQMLVGHADIAVHSLKDLPTNLPDGLTLGCITKREDPSDALVVNKKNKIYQLESLPPGSIVGTSSLRRLAQLRYKFPHLDFKDIRGNVITRIEKLDSGEFDCIILAAAGLKRLGFESRVHQIIPNEISLHAVGQGALGIECKSDDKEVLKIISVLEDKVSSQRCLAERSFLRELEGGCQVPIGVNSSIQNDEIALIGMVASIDGKRLIKNESIGNIKYPEEVGKKLAEKLKLQGADKILSEIFEQFRDK,<ps2>,VNSNSSNQVGKNIRRTGFLIVLSYLLIVLIMKVLEANNFFGYSLSSFSNDIFAPPSLKHLCGTDRLGRDVCLRTLQGSSIAIEVVFLAIFFALILGLPLGLLSGYFGGILDKCLSLVMDTIFSIPVILLAVVVAFVLGKGIINASIALCIVYSPQYFRLIRNQTMLIKSETYVEAARVSGADVKTIIFKYILPNVITPLPILLTLNAADAVLVLGSLGFLGLGVPANVPEWGSDLNLALAAIPTGIWWTALFPGLAMFFLVLGLSFIGEELENIFEN,<
|
| 27 |
+
<ps1>,MTKFKLKIASRRSKLAMVQTLWVKEQLEKNIPDLEVSIEAMATQGDKILDVALAKIGDKGLFTKELEAQMLVGHADIAVHSLKDLPTNLPDGLTLGCITKREDPSDALVVNKKNKIYQLESLPPGSIVGTSSLRRLAQLRYKFPHLDFKDIRGNVITRIEKLDSGEFDCIILAAAGLKRLGFESRVHQIIPNEISLHAVGQGALGIECKSDDKEVLKIISVLEDKVSSQRCLAERSFLRELEGGCQVPIGVNSSIQNDEIALIGMVASIDGKRLIKNESIGNIKYPEEVGKKLAEKLKLQGADKILSEIFEQFRDK,<ps2>,MTKFKLKIASRRSKLAMVQTLWVKEQLEKNIPDLEVSIEAMATQGDKILDVALAKIGDKGLFTKELEAQMLVGHADIAVHSLKDLPTNLPDGLTLGCITKREDPSDALVVNKKNKIYQLESLPPGSIVGTSSLRRLAQLRYKFPHLDFKDIRGNVITRIEKLDSGEFDCIILAAAGLKRLGFESRVHQIIPNEISLHAVGQGALGIECKSDDKEVLKIISVLEDKVSSQRCLAERSFLRELEGGCQVPIGVNSSIQNDEIALIGMVASIDGKRLIKNESIGNIKYPEEVGKKLAEKLKLQGADKILSEIFEQFRDK,<
|
| 28 |
+
<ps1>,MLVKKMTELFSRFFVKAISFAICISVFFTLFNSPSYAAKTSMTGDYAKDTISVVKTLQIAVETPKDSPDKDKVRDESLALITDYISRYRNRGMVNKTQSFTTMQTALNAMAGHYKNFATRPLPDKLKERLTKEFTLAEKMVLRES,<ps2>,MLVKKMTELFSRFFVKAISFAICISVFFTLFNSPSYAAKTSMTGDYAKDTISVVKTLQIAVETPKDSPDKDKVRDESLALITDYISRYRNRGMVNKTQSFTTMQTALNAMAGHYKNFATRPLPDKLKERLTKEFTLAEKMVLRES,<
|
| 29 |
+
<ps1>,LKKITSILIIFFLIVLYPIRTYSAEILQINNSSSILVGDQNRDLPIKLFCVEINNEDDEKIALNLLKKEFPRGSKVKIKPIGFKENILTARVFNINETKEMSDLLISKNLSKETCQN,<ps2>,LKKITSILIIFFLIVLYPIRTYSAEILQINNSSSILVGDQNRDLPIKLFCVEINNEDDEKIALNLLKKEFPRGSKVKIKPIGFKENILTARVFNINETKEMSDLLISKNLSKETCQN,<
|
| 30 |
+
<ps1>,MESIFNNSFATLVAYVGIVSIYLLVIPLILFYWMNNRWNVMGKFERLIVYGLVFLFFPGLILFSPFLNLRLRGDSKG,<ps2>,MESIFNNSFATLVAYVGIVSIYLLVIPLILFYWMNNRWNVMGKFERLIVYGLVFLFFPGLILFSPFLNLRLRGDSKG,<
|
| 31 |
+
<ps1>,LTKGKVVQIGLLISLLGLLSYKLAPQLGIDNFTASTISNFVLIVIVISWVTSYVLRVLNGKMTFMEQRKRYRKEYEKIVNDKLETKFNLLPKEEQEKLMEDLEKNP,<ps2>,LTKGKVVQIGLLISLLGLLSYKLAPQLGIDNFTASTISNFVLIVIVISWVTSYVLRVLNGKMTFMEQRKRYRKEYEKIVNDKLETKFNLLPKEEQEKLMEDLEKNP,<
|
| 32 |
+
<ps1>,MNDSYYKDKEKIYDAEVLESSSLDENIIIKILIRAGRTIAKPALEVLEMALDPFTPTQVRVSLMAALAYLIMPFDLFPDFMPLVGYSDDFVALTAVLSIWSRYMTPAIRARAERKLNKLFPFVK,<ps2>,MNDSYYKDKEKIYDAEVLESSSLDENIIIKILIRAGRTIAKPALEVLEMALDPFTPTQVRVSLMAALAYLIMPFDLFPDFMPLVGYSDDFVALTAVLSIWSRYMTPAIRARAERKLNKLFPFVK,<
|
| 33 |
+
<ps1>,MLRSIFAGFFAIVLTLGLGISSVSAKTVEVKLGTDAGMLAFEPSSVTISTGDTVKFINNKLAPHNAVFDGHEELSHADLAFAPGESWEETFDTAGTFDYYCEPHRGAGMVGKVIVE,<ps2>,MKGHKKIRFIFPLVAMYVPLLLLAPKAIAGSFGAEIFCTMRDGGNDHESSWQAAYSYIKKQKGGIFKTSPKQAAGQIIETVVRERDKFSYCVEFLDQLHPDRKLQLENDRKEKRRKKEELLQDKENEDYSKETFDRYSY,<
|
| 34 |
+
<ps1>,MLRSIFAGFFAIVLTLGLGISSVSAKTVEVKLGTDAGMLAFEPSSVTISTGDTVKFINNKLAPHNAVFDGHEELSHADLAFAPGESWEETFDTAGTFDYYCEPHRGAGMVGKVIVE,<ps2>,VQFIDQANIILKAGKGGNGIVSFRREKFVPAGGPSGGNGGKGGSIIIIADNNLQTLLDFKFNREIFAKDGFKGGPNKRSGASGENTILKVPCGTEIRDFNTGIILGDLTEDKQSLTIAHGGRGGHGNAYYLSNQNRAPESFTEGKEGEIWEVQLELKLLAEVGIIGLPNAGKSTLISVLSSARPKIANYPFTTLIPNLGVVRKADGNGCLFADIPGLISGAAEGVGLGHDFLRHIQRTKILIHLIDSIAENPIRDFEIIEKELKRYGSGLLNKERIVVLNKMELVDENYLQTITKKLENLSKKKVLVISSSLRKGLSPLLSEVWKRI,<
|
| 35 |
+
<ps1>,MTNKKRILSGVQPTGDLHIGNWLGAINNWVELQEKHETFLCVVDLHAITTEYDTKQLSKNTLSTAALYIACGINPKICSIFVQSQISAHSELCWILNCMTPINWMERMIQFKEKSIQQGNNVSIGLFDYPILMAADILLYDADYVPVGEDQKQHLELAKDIAQQRINAKFGKEENILKIPQPIIMKKGSKIMSLNDGSKKMSKSDINEGSRINLLDTPEIITKKIKRAKSDSYMGMEFNNPERPESRNLLMIYSLLSGKEVSELENDLSQTGWGTFKKIFTEQIIESLKPIQERYQVLINDPHELNKILIQGKEKAEVVANKTLSRVKSELGFFEIEK,<ps2>,MTNKKRILSGVQPTGDLHIGNWLGAINNWVELQEKHETFLCVVDLHAITTEYDTKQLSKNTLSTAALYIACGINPKICSIFVQSQISAHSELCWILNCMTPINWMERMIQFKEKSIQQGNNVSIGLFDYPILMAADILLYDADYVPVGEDQKQHLELAKDIAQQRINAKFGKEENILKIPQPIIMKKGSKIMSLNDGSKKMSKSDINEGSRINLLDTPEIITKKIKRAKSDSYMGMEFNNPERPESRNLLMIYSLLSGKEVSELENDLSQTGWGTFKKIFTEQIIESLKPIQERYQVLINDPHELNKILIQGKEKAEVVANKTLSRVKSELGFFEIEK,<
|
| 36 |
+
<ps1>,LGRSRKTNQLIREFLSELKEVLTKDGSITLRSLIFQENFHSLEGALKETEIKFIYPSDLKRFKDKSLNVLDICFGLGYNSASLFNNVIRQNSLINWYALEIDKKPLEYSLGNKSFQKLWHPKVFKILKALLKNSKYKDQFFDCDILWGDAREKIKNIPANIKFDLIYLDGFSPQKCPQVWSVEFLSKVTQKLNPQGYLITYSCSAAIRSTLKDFGLNIFNNKPNLVSKNLWSYGTIAVKNIDEKVLQNNLYLKKLSWMEEEHLLTKASIPYRDPTLNSNPKDIIKKRVQEQFLSNLKTSKKWRDKWGMTK,<ps2>,LGRSRKTNQLIREFLSELKEVLTKDGSITLRSLIFQENFHSLEGALKETEIKFIYPSDLKRFKDKSLNVLDICFGLGYNSASLFNNVIRQNSLINWYALEIDKKPLEYSLGNKSFQKLWHPKVFKILKALLKNSKYKDQFFDCDILWGDAREKIKNIPANIKFDLIYLDGFSPQKCPQVWSVEFLSKVTQKLNPQGYLITYSCSAAIRSTLKDFGLNIFNNKPNLVSKNLWSYGTIAVKNIDEKVLQNNLYLKKLSWMEEEHLLTKASIPYRDPTLNSNPKDIIKKRVQEQFLSNLKTSKKWRDKWGMTK,<
|
| 37 |
+
<ps1>,LKNGADSIQVFSDLELLKKTAKKWDGNKRLMLAERGGKTIDGFDLGNSPLSVTKETVQGKRLFMSTTNGTKSLKKVQNVENLFAMSLPNRKAVAERIISLNKKNVLILGSGWEGSYSLEDSLAAGALAIYIKENFNSEVNILNDELQASLALWNVWKNDILKCLKTATHGKRLTSLGDYEDDFKCCSELDCLDIVPTQVERGVIRAS,<ps2>,LKNGADSIQVFSDLELLKKTAKKWDGNKRLMLAERGGKTIDGFDLGNSPLSVTKETVQGKRLFMSTTNGTKSLKKVQNVENLFAMSLPNRKAVAERIISLNKKNVLILGSGWEGSYSLEDSLAAGALAIYIKENFNSEVNILNDELQASLALWNVWKNDILKCLKTATHGKRLTSLGDYEDDFKCCSELDCLDIVPTQVERGVIRAS,<
|
| 38 |
+
<ps1>,LISEIKELCLKANAIILAHYYQAPEIQDIADFIGDSLDLSRKAANNDADTIVFCGVHFMAETAKILSPNKTVLLPDIDAGCSLADDCPAEEFQKFREENPDHYVVSYINCTAEVKAQSDLICTSSNAVSLVEKIPKDKKIIFAPDKNLGRWVQKNSGRKLKLWPGSCIVHETFSEEALLKLKYKHPDAKVIAHPECSQNLLVLSDFIGSTSKLLDFVSNDYSDTYMVLTEPGIIHQMKKKEPNKNFIEVPDIDGCKCNECPYMKLNTLEKILDCLKNNSPSIELDPEIIKKAYKPIKRMLDMSI,<ps2>,LISEIKELCLKANAIILAHYYQAPEIQDIADFIGDSLDLSRKAANNDADTIVFCGVHFMAETAKILSPNKTVLLPDIDAGCSLADDCPAEEFQKFREENPDHYVVSYINCTAEVKAQSDLICTSSNAVSLVEKIPKDKKIIFAPDKNLGRWVQKNSGRKLKLWPGSCIVHETFSEEALLKLKYKHPDAKVIAHPECSQNLLVLSDFIGSTSKLLDFVSNDYSDTYMVLTEPGIIHQMKKKEPNKNFIEVPDIDGCKCNECPYMKLNTLEKILDCLKNNSPSIELDPEIIKKAYKPIKRMLDMSI,<
|
| 39 |
+
<ps1>,MNRKSNNSNPTGNLDYDKILEEEIINSYENKFEANSNINNKNKRFYRLKRTPLEVINRLFFFFFVGSFIFSFFLAYSENKVWFIIYLISAFSCIFYTPNRKALKELIAAWPNIEDLIKGRSLWRKDNK,<ps2>,MNRKSNNSNPTGNLDYDKILEEEIINSYENKFEANSNINNKNKRFYRLKRTPLEVINRLFFFFFVGSFIFSFFLAYSENKVWFIIYLISAFSCIFYTPNRKALKELIAAWPNIEDLIKGRSLWRKDNK,<
|
| 40 |
+
<ps1>,MKLRLFEFYFIKDYLRPWFGLIYSLFFLFFLGAIGYRITEGWDWGDCLWMVLITITTIGFGEVQTLSPEGRIITVLIIVGGLIFIQFTFQKAVRLFESGYFQRVNELRFKRLLRKMENHVILCGYGRVGQEISNQIKTQNIPIIVVESDEDRKKIAEDNGLEVLCADATLDETLKLAGLDKCKSLVVTLPNDAANLYVVLSAKGIRSSIRVIARAGTEEAASKLRLAGASIVVSPYIAAGRAMASMALRPIAIDFLDLLAGSECEIEEFELSNDISLFETAEKITLLELGIGKKSGAKILAIKEDEKLITNPGGDFLLQPGQVLIAFGSKEQLTTLNRLLGNLVVSVELLK,<ps2>,VPNNQNRDNFIDKAFTVIAESIVKIMPIADKEKKAYIYYRDGLAAQNNGDYSEALDYYNESLLLEENKIDRGETLKNMAIIYMSNGEEDRSIETYQKALEENPKQPSCLKNIGLIYEKRGRFAEQNGDLDQRDMWFDKAAQVWSKAVRLYPGGYLDIENWLKTSGRSSIDIYL,<
|
| 41 |
+
<ps1>,MKLRLFEFYFIKDYLRPWFGLIYSLFFLFFLGAIGYRITEGWDWGDCLWMVLITITTIGFGEVQTLSPEGRIITVLIIVGGLIFIQFTFQKAVRLFESGYFQRVNELRFKRLLRKMENHVILCGYGRVGQEISNQIKTQNIPIIVVESDEDRKKIAEDNGLEVLCADATLDETLKLAGLDKCKSLVVTLPNDAANLYVVLSAKGIRSSIRVIARAGTEEAASKLRLAGASIVVSPYIAAGRAMASMALRPIAIDFLDLLAGSECEIEEFELSNDISLFETAEKITLLELGIGKKSGAKILAIKEDEKLITNPGGDFLLQPGQVLIAFGSKEQLTTLNRLLGNLVVSVELLK,<ps2>,VSGWLFIIFLLLLGGLISTLGDLLGSKIGKARFSILKLRPKKTAILITILTGSLISASSLFLMILVNRQLRVGLFRLGDLQKKLQESKQVLIPLEKEREKLENKIKAKETEFKQLERNIIALRSGKFVIRSGQSLIISEISSSNLKDIKSKIEKIIINANRYTHKIVKPKNKEVKNLLLLRKNHIEEMQNIILKGGNWVINIKSVRNVLTGENFVYAFPEITENKIIVRKGEKITKIDFKQEDFNKKDFGDKVNFLLSSSLAEIKRRGSLVNEIKLRGDSIKELRDFLNKNDKTNFELEAVSLFNSKTAQPVIVELNVNYPES,<
|
| 42 |
+
<ps1>,MKLRLFEFYFIKDYLRPWFGLIYSLFFLFFLGAIGYRITEGWDWGDCLWMVLITITTIGFGEVQTLSPEGRIITVLIIVGGLIFIQFTFQKAVRLFESGYFQRVNELRFKRLLRKMENHVILCGYGRVGQEISNQIKTQNIPIIVVESDEDRKKIAEDNGLEVLCADATLDETLKLAGLDKCKSLVVTLPNDAANLYVVLSAKGIRSSIRVIARAGTEEAASKLRLAGASIVVSPYIAAGRAMASMALRPIAIDFLDLLAGSECEIEEFELSNDISLFETAEKITLLELGIGKKSGAKILAIKEDEKLITNPGGDFLLQPGQVLIAFGSKEQLTTLNRLLGNLVVSVELLK,<ps2>,LGRSRKTNQLIREFLSELKEVLTKDGSITLRSLIFQENFHSLEGALKETEIKFIYPSDLKRFKDKSLNVLDICFGLGYNSASLFNNVIRQNSLINWYALEIDKKPLEYSLGNKSFQKLWHPKVFKILKALLKNSKYKDQFFDCDILWGDAREKIKNIPANIKFDLIYLDGFSPQKCPQVWSVEFLSKVTQKLNPQGYLITYSCSAAIRSTLKDFGLNIFNNKPNLVSKNLWSYGTIAVKNIDEKVLQNNLYLKKLSWMEEEHLLTKASIPYRDPTLNSNPKDIIKKRVQEQFLSNLKTSKKWRDKWGMTK,<
|
| 43 |
+
<ps1>,MSIAKKALLFTSALALIAGPSVTASTRLSGAGASFPAKIYTRWFSDLAKSGGPRVNYQAVGSGSGRKAFIDQTVNFGASDDPMKDKDIAKVTRGLVQIPMVGGTIAFGYNYDCDLKLTQEQAVQVAMGMVKNWKELGCKAGKLTWAHRSDGSGTTKAFTNSMEAFSPTWTLGTGKSVKWPAGVGAKGNSGVAGVIQNTPGAIGYVNQSYIKGNVKAAALQNLSGEFLKPSVEAGAKALNGITLDENLAGKNPNPTAKGAYPIASLTWILAYEKGNGRNTKAIKKSLSTLLSDEYQDKAPTLGFVPLKGDILEKSRAAVKKIGR,<ps2>,MSIAKKALLFTSALALIAGPSVTASTRLSGAGASFPAKIYTRWFSDLAKSGGPRVNYQAVGSGSGRKAFIDQTVNFGASDDPMKDKDIAKVTRGLVQIPMVGGTIAFGYNYDCDLKLTQEQAVQVAMGMVKNWKELGCKAGKLTWAHRSDGSGTTKAFTNSMEAFSPTWTLGTGKSVKWPAGVGAKGNSGVAGVIQNTPGAIGYVNQSYIKGNVKAAALQNLSGEFLKPSVEAGAKALNGITLDENLAGKNPNPTAKGAYPIASLTWILAYEKGNGRNTKAIKKSLSTLLSDEYQDKAPTLGFVPLKGDILEKSRAAVKKIGR,<
|
| 44 |
+
<ps1>,MKIGINGFGRIGRLVFRALWDRADTEITHINEMAGDSNAAAHLLEFDSVHGRWVKDIKVKEEEIIIDGKKLAYTSFKNYLDVPWEKSSVDIILECTGKNKKPDKLNPYFDSLGMKRVIVACPVKGIVAEAESLNIVYGINQNLYDPSKHKLVTAASCTTNCLAPIVKVINENFSIKHGAITTIHDVTNTQVPVDFYKSDLRRARGCMQSLIPTTTGSAKAIAEIFPELKGKLNGHAVRVPLLNGSLTDAVFELNNAVTEEQVNNEFKKASETYLEGILGYEERPLVSADYVNDSRSSIVDSLSTMVVNSNLLKIYAWYDNEWGYSCRLADLTEYVIKKEI,<ps2>,MKIGINGFGRIGRLVFRALWDRADTEITHINEMAGDSNAAAHLLEFDSVHGRWVKDIKVKEEEIIIDGKKLAYTSFKNYLDVPWEKSSVDIILECTGKNKKPDKLNPYFDSLGMKRVIVACPVKGIVAEAESLNIVYGINQNLYDPSKHKLVTAASCTTNCLAPIVKVINENFSIKHGAITTIHDVTNTQVPVDFYKSDLRRARGCMQSLIPTTTGSAKAIAEIFPELKGKLNGHAVRVPLLNGSLTDAVFELNNAVTEEQVNNEFKKASETYLEGILGYEERPLVSADYVNDSRSSIVDSLSTMVVNSNLLKIYAWYDNEWGYSCRLADLTEYVIKKEI,<
|
| 45 |
+
<ps1>,MKNSIKITQLFLLLIFLTSCKATANKQELIIDSEEQESQQTKLSKSKMEVRYSCGEDGISDFLNDGWIISKQYTEEKICTWKSFPATKDCDMEKDKGCKITTPDKIGEEKVYLLEK,<ps2>,MKNSIKITQLFLLLIFLTSCKATANKQELIIDSEEQESQQTKLSKSKMEVRYSCGEDGISDFLNDGWIISKQYTEEKICTWKSFPATKDCDMEKDKGCKITTPDKIGEEKVYLLEK,<
|
| 46 |
+
<ps1>,MNFEIKNVFLTIEGKSIVNDVSIKVCPGEIVGLMGPNGAGKTSTFNLAVGNLRPDKGDILINSKSIKNLPLPIRAKLGLGYLTQEASIFRDLTVKENIDLALENSFSSRAIVRNKREKIINEFNLNKVVDNYGYQLSGGERRRCEIARALSVGRQGPKYLLLDEPFAGIDPLAVNDLKKLIIKLRDNGMGILITDHNVRETLLITSKSYVLSEGKILAHGSSDELANNQIVKKFYLGVDFQL,<ps2>,MAAKEHKSLQGSKILLIEDDKSIRLTVTESLISEGFEVSNFKDGSSALDFILGEGIKDFDLILLDLMLPGLNGLELCRKIRNEELYTPILILSAKGNESDRVLGLEVGADDYLTKPFGISELIARCRALLRRSKRGKEKKQKIETIIEYKNIKMFTEECRVTNFNQEIILSPKEFKLLELFIKNPKRVWSRDLILEKIWAIDFIGDTKTVDVHVRWLREKLEENPSAPKIIKTVRGFGYRFG,<
|
| 47 |
+
<ps1>,MNFEIKNVFLTIEGKSIVNDVSIKVCPGEIVGLMGPNGAGKTSTFNLAVGNLRPDKGDILINSKSIKNLPLPIRAKLGLGYLTQEASIFRDLTVKENIDLALENSFSSRAIVRNKREKIINEFNLNKVVDNYGYQLSGGERRRCEIARALSVGRQGPKYLLLDEPFAGIDPLAVNDLKKLIIKLRDNGMGILITDHNVRETLLITSKSYVLSEGKILAHGSSDELANNQIVKKFYLGVDFQL,<ps2>,MNFEIKNVFLTIEGKSIVNDVSIKVCPGEIVGLMGPNGAGKTSTFNLAVGNLRPDKGDILINSKSIKNLPLPIRAKLGLGYLTQEASIFRDLTVKENIDLALENSFSSRAIVRNKREKIINEFNLNKVVDNYGYQLSGGERRRCEIARALSVGRQGPKYLLLDEPFAGIDPLAVNDLKKLIIKLRDNGMGILITDHNVRETLLITSKSYVLSEGKILAHGSSDELANNQIVKKFYLGVDFQL,<
|
| 48 |
+
<ps1>,MLLSKLVDLIKSGESKFIKANIFENIDIENAASIDIALKNQISFLEENNILKDNLGKTSASAIITSNNNEILGLLESLNISNIVVENPRIAFAEVLNFLYEEINFNPGIDDSAVIKSSAKVGKNCYVGPNVYIGENSIIGDNNKIFPGTTILGNVRLGNNNVIHPNCVIYENTSIENNCVINSNTVIGSEGFGFIPQDGKWIKMPQKGCVIIKSFVEIGTNCCIDRPSVGNTFIDEGTKMDNLVQIGHGVKIGKNCAFAAQVGIAGGAVIGNSVILAGQVGVNNRVKVGNNVIASSKCGIHCDIEDGEVVSGFPAMKNKSWLRSSSVFKKLPELAKKLRQLDKK,<ps2>,MLLSKLVDLIKSGESKFIKANIFENIDIENAASIDIALKNQISFLEENNILKDNLGKTSASAIITSNNNEILGLLESLNISNIVVENPRIAFAEVLNFLYEEINFNPGIDDSAVIKSSAKVGKNCYVGPNVYIGENSIIGDNNKIFPGTTILGNVRLGNNNVIHPNCVIYENTSIENNCVINSNTVIGSEGFGFIPQDGKWIKMPQKGCVIIKSFVEIGTNCCIDRPSVGNTFIDEGTKMDNLVQIGHGVKIGKNCAFAAQVGIAGGAVIGNSVILAGQVGVNNRVKVGNNVIASSKCGIHCDIEDGEVVSGFPAMKNKSWLRSSSVFKKLPELAKKLRQLDKK,<
|
| 49 |
+
<ps1>,MSSNFKNLYTSNNPPLEMILMRGSKLESIHKVHAVISDKKGRVLMCAGNPEYKSFIRSALKPFQAIPFVSSGASSKIKNSSKSIALSCGSHSGSKLHAREAFKILWEYNIDIHNLKCPIKKTSPLEHNCSGKHAAFLATCKKLNWPLETYLKGDHPLQVEIFRIISELLEIPLEQIYAERDDCGAPTLYMKILEMAKLYSLLSSSDNAELEQISRAMTINPTMISDHNRFDTEVIQASHGHVISKGGAEGIQCFCKVNEGMGLALKVEDGSKRAKQSVGLHILKQLEWISELRIQDIEDKIIKLPEGVQIEVKGQLKFQES,<ps2>,VPNNQNRDNFIDKAFTVIAESIVKIMPIADKEKKAYIYYRDGLAAQNNGDYSEALDYYNESLLLEENKIDRGETLKNMAIIYMSNGEEDRSIETYQKALEENPKQPSCLKNIGLIYEKRGRFAEQNGDLDQRDMWFDKAAQVWSKAVRLYPGGYLDIENWLKTSGRSSIDIYL,<
|
| 50 |
+
<ps1>,VRIIFWGTPEYSVKSLEVLKKSDHDIVAVITQPDKKRSRGNKLISSPVKEYATKENIPVFTPETIKENIQFISILNDLSCDLFIVIAYGKILPKAILDIPKYKSWNAHASLLPRWRGAAPIQWSILEGDKITGVGIMRMEEGLDTGDVLVEKQIKIENNDNLKTLTKKLSDLSSELFLRAISDIEQNKNRDINLLLKKQTDFKRELKYARMINKLDYIINWENSATDIYRKINALYPRANTTYKRKNLKIIKIKILTTHEIHNKNYKILSNVFKPGLIIGLIKNVGIIITTKTDPILLLEAKLEGKKVSSQNQLIQQLNPVIGENFSD,<ps2>,VRIIFWGTPEYSVKSLEVLKKSDHDIVAVITQPDKKRSRGNKLISSPVKEYATKENIPVFTPETIKENIQFISILNDLSCDLFIVIAYGKILPKAILDIPKYKSWNAHASLLPRWRGAAPIQWSILEGDKITGVGIMRMEEGLDTGDVLVEKQIKIENNDNLKTLTKKLSDLSSELFLRAISDIEQNKNRDINLLLKKQTDFKRELKYARMINKLDYIINWENSATDIYRKINALYPRANTTYKRKNLKIIKIKILTTHEIHNKNYKILSNVFKPGLIIGLIKNVGIIITTKTDPILLLEAKLEGKKVSSQNQLIQQLNPVIGENFSD,<
|
| 51 |
+
<ps1>,MRNEKYWVKALDQTHLSITNNGLFPLKTTVVTREYYNKNDFIIRELDTSRFTKKNNYGPNQNPFNPWDKILEVDKVGTNHQLILNKYPVQKGHILLITNTWRPQDGWLDINDWIAIQMVNEDTSGLWFFNSSPIAGASQPHRHFQLLRRDHGEIICPREKWFLDFENNNDQDSKLKKNTVVSKFNFLNNSINIYNLYLELSNKIGLGNPIDDEKPRFPYNILITNNWIAIIKRKYDHVHGFSVNGLGFAGYLLVTEKSNINYLKKYGPEKLLENFV,<ps2>,MRNEKYWVKALDQTHLSITNNGLFPLKTTVVTREYYNKNDFIIRELDTSRFTKKNNYGPNQNPFNPWDKILEVDKVGTNHQLILNKYPVQKGHILLITNTWRPQDGWLDINDWIAIQMVNEDTSGLWFFNSSPIAGASQPHRHFQLLRRDHGEIICPREKWFLDFENNNDQDSKLKKNTVVSKFNFLNNSINIYNLYLELSNKIGLGNPIDDEKPRFPYNILITNNWIAIIKRKYDHVHGFSVNGLGFAGYLLVTEKSNINYLKKYGPEKLLENFV,<
|
| 52 |
+
<ps1>,MTTSSKKDYLSILGLSSKFDDIELKKAFRREARKWHPDLNKNDINAEDRFKLINEAYEFLRDPVRRVKSIDSNSSNEEIYNKYSTGFPEFKDYLNSLFGFEYESELDNESYDQTSDFYEDEKPNAIFNEEEFNSYDYPARSPEEPPPVKLHQDIETIIELTPDEALSGASILIELEDQTVVEVDTPPFAGDGWRLRLENIAKGGKDHYLQLKVQTENGLRIDGLRVLYKLELFPPDALLGCAVEVPTLDGNVTLQVPPKSSTGRLLRLKGRGLSFGDNIGDQFVEILVVIPADINDEEIALYTRLQELSLSDE,<ps2>,MTTSSKKDYLSILGLSSKFDDIELKKAFRREARKWHPDLNKNDINAEDRFKLINEAYEFLRDPVRRVKSIDSNSSNEEIYNKYSTGFPEFKDYLNSLFGFEYESELDNESYDQTSDFYEDEKPNAIFNEEEFNSYDYPARSPEEPPPVKLHQDIETIIELTPDEALSGASILIELEDQTVVEVDTPPFAGDGWRLRLENIAKGGKDHYLQLKVQTENGLRIDGLRVLYKLELFPPDALLGCAVEVPTLDGNVTLQVPPKSSTGRLLRLKGRGLSFGDNIGDQFVEILVVIPADINDEEIALYTRLQELSLSDE,<
|
| 53 |
+
<ps1>,LKKTFKVTITNKETGKIYQENISDQEYILKEFEKKGLRLPFSCRNGCCTSCAVKIISGKLDQPEAMGVSQDLKDKGYALLCVAKVIEDIEVETTYYDEVYDLQFGQYFGKGKTRKAPPWEFEED,<ps2>,MKGHKKIRFIFPLVAMYVPLLLLAPKAIAGSFGAEIFCTMRDGGNDHESSWQAAYSYIKKQKGGIFKTSPKQAAGQIIETVVRERDKFSYCVEFLDQLHPDRKLQLENDRKEKRRKKEELLQDKENEDYSKETFDRYSY,<
|
| 54 |
+
<ps1>,MDFKTYQKQARLTAQYPNLGSNNIYPTLGLVGEAGEVAEKVKKVIRDKKGIFDEESKKGIKKELGDVLWYISNLCNEFNFELEEVALQNLEKLKLRAAKGKISGSGDDR,<ps2>,MDFKTYQKQARLTAQYPNLGSNNIYPTLGLVGEAGEVAEKVKKVIRDKKGIFDEESKKGIKKELGDVLWYISNLCNEFNFELEEVALQNLEKLKLRAAKGKISGSGDDR,<
|
| 55 |
+
<ps1>,MSFFQGKILLNFIIDLLNKPAINWSNFELNSSLQLNDFVDLLLEPLNTSQYSYNIKLGLHEALINAVTHGNKLDPNKSIRVRRIITPNWCVWQIQDQGNGLEIKKRLYKLPKKFTSFNGRGLYIINECFDDIRWSNKGNRLQLALKR,<ps2>,VPNNQNRDNFIDKAFTVIAESIVKIMPIADKEKKAYIYYRDGLAAQNNGDYSEALDYYNESLLLEENKIDRGETLKNMAIIYMSNGEEDRSIETYQKALEENPKQPSCLKNIGLIYEKRGRFAEQNGDLDQRDMWFDKAAQVWSKAVRLYPGGYLDIENWLKTSGRSSIDIYL,<
|
| 56 |
+
<ps1>,MSFFQGKILLNFIIDLLNKPAINWSNFELNSSLQLNDFVDLLLEPLNTSQYSYNIKLGLHEALINAVTHGNKLDPNKSIRVRRIITPNWCVWQIQDQGNGLEIKKRLYKLPKKFTSFNGRGLYIINECFDDIRWSNKGNRLQLALKR,<ps2>,MSFFQGKILLNFIIDLLNKPAINWSNFELNSSLQLNDFVDLLLEPLNTSQYSYNIKLGLHEALINAVTHGNKLDPNKSIRVRRIITPNWCVWQIQDQGNGLEIKKRLYKLPKKFTSFNGRGLYIINECFDDIRWSNKGNRLQLALKR,<
|
| 57 |
+
<ps1>,VASTLLFTALKEAIDEEMANDVNVCIMGEDVGQYGGSYKVTKDLYEKYGELRVLDTPIAENSFTGMAVGAAMTGLRPIVEGMNMGFLLLAFNQISNNMGMLRYTSGGNYKIPAVVRGPGGVGRQLGAEHSQRLEAYFHAVPGIKIVACSTPTNAKGLMKAAIRDNNPVLFFEHVLLYNLSEELPEGDYICSLDQADLVKEGKDITILTYSRMRHHCLKAVEELDKKNIDVELIDLISLKPFDMKTISKSIKKTNNVIIVEECMKTGGIGAELIALITEECFDDLDTRPIRLSSQDIPTPYNGNLENLTIIQPHQIVEKVEEVINGSI,<ps2>,VASTLLFTALKEAIDEEMANDVNVCIMGEDVGQYGGSYKVTKDLYEKYGELRVLDTPIAENSFTGMAVGAAMTGLRPIVEGMNMGFLLLAFNQISNNMGMLRYTSGGNYKIPAVVRGPGGVGRQLGAEHSQRLEAYFHAVPGIKIVACSTPTNAKGLMKAAIRDNNPVLFFEHVLLYNLSEELPEGDYICSLDQADLVKEGKDITILTYSRMRHHCLKAVEELDKKNIDVELIDLISLKPFDMKTISKSIKKTNNVIIVEECMKTGGIGAELIALITEECFDDLDTRPIRLSSQDIPTPYNGNLENLTIIQPHQIVEKVEEVINGSI,<
|
| 58 |
+
<ps1>,MSKLSTTKICVKSPAKINLHLEIIGKRKDGYHELAMIMQNIDLSDYIEFENNQIGEIKLKSNSKDLSLDEDNLIIKAANYIKDMSKNKELGANIFLKKNIPIGAGLAGGSSNAAATLVGLNKLWDLDLDYETIFILSAKLGSDVPFFIEGGCQFCFGRGEILEKYSSNFDFGVILLKNPNISISTVDTYKKYSQEFCPKYFTETEKTNKIRNDLRVNGFNDFKLSEQRINVKNDLQVIVERENNSVKKALYLLSNLQNCLSYSMSGSGPTCFALFKDINIANEVFEQNYKMFNNNGFEAWVCKLINSGITLL,<ps2>,LVKPKSPDNKISNHLQQDVVKIAGKTIFINPFLYWRRFDENTNRWLREPGQMSEEQIQPNRNRFYPEIDWADLSQNQKLVKDASVEMFLKTLELISTFHPQLNSGQLLEVERKMAITKKLPFEKWVTKSFAKKARAEEYEKRKFKRDRFIRSWKEWLSLENTQQALLPIIVVVFVSAFIGWSSGVSKNSCNPYFEQNLDQSI,<
|
| 59 |
+
<ps1>,MSKLSTTKICVKSPAKINLHLEIIGKRKDGYHELAMIMQNIDLSDYIEFENNQIGEIKLKSNSKDLSLDEDNLIIKAANYIKDMSKNKELGANIFLKKNIPIGAGLAGGSSNAAATLVGLNKLWDLDLDYETIFILSAKLGSDVPFFIEGGCQFCFGRGEILEKYSSNFDFGVILLKNPNISISTVDTYKKYSQEFCPKYFTETEKTNKIRNDLRVNGFNDFKLSEQRINVKNDLQVIVERENNSVKKALYLLSNLQNCLSYSMSGSGPTCFALFKDINIANEVFEQNYKMFNNNGFEAWVCKLINSGITLL,<ps2>,VPNNQNRDNFIDKAFTVIAESIVKIMPIADKEKKAYIYYRDGLAAQNNGDYSEALDYYNESLLLEENKIDRGETLKNMAIIYMSNGEEDRSIETYQKALEENPKQPSCLKNIGLIYEKRGRFAEQNGDLDQRDMWFDKAAQVWSKAVRLYPGGYLDIENWLKTSGRSSIDIYL,<
|
| 60 |
+
<ps1>,MKNLKLKVIFKYLKPYKKEFLYGGIALLVVNILSILIPLEVKNIIDQLKDGFSSSFVISKSLFLMFLATCMGLIRLFSRQIVFGIGRKVEVNLRQKLFDHLLIQDPDWIQKKGSGDIISRATSDVENIRRLLGFTVLSLCNIVLAYSLTIPSMLSINKTLTVAALMIFPMILVIVSLFGGRMVSQRKIQQESLSKLSDLIQEDLSGISAIKIYAQEEAEKKQFNNYNKVYRNSAIKLARTASTLFPLLQGISSISLLILLGLGTSQLENGFITIGGLVALILFVERLVFPTALLGFTLNTFQLGQVSLDRVEEIFQNNPKITDKPKAKFIKKKVKGTIEAKNLKIKYEGAKFNSLNRLNFKINPGELIAIVGPVGCGKTTLAKSLGRTIEIPDGQLFLDDIDITNIKLRDLRKHIAIVPQEAFLFTSTISENLKFGDPKASRNVVKNSAVNAGLIDDINSFPDGFKTIVGERGITLSGGQRQRTALGRALLVDASVVVLDDALASVDNKTAAKIIEEMRANKSKTILMISHQLSVAATCDRVLVMDQGKIVQEGIHKDLITTNGLYKNLWEREIATNKIVS,<ps2>,MKNLKLKVIFKYLKPYKKEFLYGGIALLVVNILSILIPLEVKNIIDQLKDGFSSSFVISKSLFLMFLATCMGLIRLFSRQIVFGIGRKVEVNLRQKLFDHLLIQDPDWIQKKGSGDIISRATSDVENIRRLLGFTVLSLCNIVLAYSLTIPSMLSINKTLTVAALMIFPMILVIVSLFGGRMVSQRKIQQESLSKLSDLIQEDLSGISAIKIYAQEEAEKKQFNNYNKVYRNSAIKLARTASTLFPLLQGISSISLLILLGLGTSQLENGFITIGGLVALILFVERLVFPTALLGFTLNTFQLGQVSLDRVEEIFQNNPKITDKPKAKFIKKKVKGTIEAKNLKIKYEGAKFNSLNRLNFKINPGELIAIVGPVGCGKTTLAKSLGRTIEIPDGQLFLDDIDITNIKLRDLRKHIAIVPQEAFLFTSTISENLKFGDPKASRNVVKNSAVNAGLIDDINSFPDGFKTIVGERGITLSGGQRQRTALGRALLVDASVVVLDDALASVDNKTAAKIIEEMRANKSKTILMISHQLSVAATCDRVLVMDQGKIVQEGIHKDLITTNGLYKNLWEREIATNKIVS,<
|
| 61 |
+
<ps1>,MAKSSWEGNCFLNFFNNKSSSGKDDKTIFKSKFTSPYKLLKCSYDQEGRCILPILHTAGGLVGGDLLEFEANIGINSKVLLTTSSAQKVYGSVGRSKINPEGTFSSQKTKISILDNSHLEYLPQETIVFANGLYSQEFNIKISDNSSFLFTDLIRLGRSSAGESIESGVFRSKLEIMRNGNLCDDWEFVDQIELTKFSFEAKSGMDFKPVFGSLIWICEKEFPITKISYLKEKIKIIFKENNNYLSLGTLENGLSIRFLGTSSQDARKCFFSIWTQIRTVCGFCKPEYQGVWPLQDL,<ps2>,MAKSSWEGNCFLNFFNNKSSSGKDDKTIFKSKFTSPYKLLKCSYDQEGRCILPILHTAGGLVGGDLLEFEANIGINSKVLLTTSSAQKVYGSVGRSKINPEGTFSSQKTKISILDNSHLEYLPQETIVFANGLYSQEFNIKISDNSSFLFTDLIRLGRSSAGESIESGVFRSKLEIMRNGNLCDDWEFVDQIELTKFSFEAKSGMDFKPVFGSLIWICEKEFPITKISYLKEKIKIIFKENNNYLSLGTLENGLSIRFLGTSSQDARKCFFSIWTQIRTVCGFCKPEYQGVWPLQDL,<
|
| 62 |
+
<ps1>,MSSKLRVGVAGPVGSGKTALVETLCIALKKRYKIAVVTNDIYTKEDANFLIKKKILEEGRIVGVETGGCPHTAIREDCSLNKNAVMDLENKYDPLDFIFVESGGDNLAASFSPELVDLSIYVIDVSAGDKIPRKGGPGITRSDLLLINKIDLADMVGANLNIMQNDTNMMRDGKPWFFTNLSSGSGVDNVIKYLVAQIPNI,<ps2>,VNSNSSNQVGKNIRRTGFLIVLSYLLIVLIMKVLEANNFFGYSLSSFSNDIFAPPSLKHLCGTDRLGRDVCLRTLQGSSIAIEVVFLAIFFALILGLPLGLLSGYFGGILDKCLSLVMDTIFSIPVILLAVVVAFVLGKGIINASIALCIVYSPQYFRLIRNQTMLIKSETYVEAARVSGADVKTIIFKYILPNVITPLPILLTLNAADAVLVLGSLGFLGLGVPANVPEWGSDLNLALAAIPTGIWWTALFPGLAMFFLVLGLSFIGEELENIFEN,<
|
| 63 |
+
<ps1>,MSSKLRVGVAGPVGSGKTALVETLCIALKKRYKIAVVTNDIYTKEDANFLIKKKILEEGRIVGVETGGCPHTAIREDCSLNKNAVMDLENKYDPLDFIFVESGGDNLAASFSPELVDLSIYVIDVSAGDKIPRKGGPGITRSDLLLINKIDLADMVGANLNIMQNDTNMMRDGKPWFFTNLSSGSGVDNVIKYLVAQIPNI,<ps2>,MTKFKLKIASRRSKLAMVQTLWVKEQLEKNIPDLEVSIEAMATQGDKILDVALAKIGDKGLFTKELEAQMLVGHADIAVHSLKDLPTNLPDGLTLGCITKREDPSDALVVNKKNKIYQLESLPPGSIVGTSSLRRLAQLRYKFPHLDFKDIRGNVITRIEKLDSGEFDCIILAAAGLKRLGFESRVHQIIPNEISLHAVGQGALGIECKSDDKEVLKIISVLEDKVSSQRCLAERSFLRELEGGCQVPIGVNSSIQNDEIALIGMVASIDGKRLIKNESIGNIKYPEEVGKKLAEKLKLQGADKILSEIFEQFRDK,<
|
| 64 |
+
<ps1>,MSSKLRVGVAGPVGSGKTALVETLCIALKKRYKIAVVTNDIYTKEDANFLIKKKILEEGRIVGVETGGCPHTAIREDCSLNKNAVMDLENKYDPLDFIFVESGGDNLAASFSPELVDLSIYVIDVSAGDKIPRKGGPGITRSDLLLINKIDLADMVGANLNIMQNDTNMMRDGKPWFFTNLSSGSGVDNVIKYLVAQIPNI,<ps2>,LGRSRKTNQLIREFLSELKEVLTKDGSITLRSLIFQENFHSLEGALKETEIKFIYPSDLKRFKDKSLNVLDICFGLGYNSASLFNNVIRQNSLINWYALEIDKKPLEYSLGNKSFQKLWHPKVFKILKALLKNSKYKDQFFDCDILWGDAREKIKNIPANIKFDLIYLDGFSPQKCPQVWSVEFLSKVTQKLNPQGYLITYSCSAAIRSTLKDFGLNIFNNKPNLVSKNLWSYGTIAVKNIDEKVLQNNLYLKKLSWMEEEHLLTKASIPYRDPTLNSNPKDIIKKRVQEQFLSNLKTSKKWRDKWGMTK,<
|
| 65 |
+
<ps1>,MSSKLRVGVAGPVGSGKTALVETLCIALKKRYKIAVVTNDIYTKEDANFLIKKKILEEGRIVGVETGGCPHTAIREDCSLNKNAVMDLENKYDPLDFIFVESGGDNLAASFSPELVDLSIYVIDVSAGDKIPRKGGPGITRSDLLLINKIDLADMVGANLNIMQNDTNMMRDGKPWFFTNLSSGSGVDNVIKYLVAQIPNI,<ps2>,MIDSFPLIKKEHIETLQINIGLKCNQACKHCHVNSSPLRSEKMSYEIISLIPKVIEKYKIKTLDITGGAPEMHPEFRNLITTLSDKNIDIIDRCNLTIFFEDGFEDLPQFLAKNNVIVTASLPCYEKDNVELQRGYGVFDKSINALKILNDLGYGKQKDGLQLNLVYNPVNPILPPSQVILKEDYKRILFEKYNISFNNLYTITNMPINRYADSLNSENKLDSYYKLLKENFNKNNLEKLMCKKTISVNWQGQIYDCDFNQQINLKGNKGPKTLSDLMSKSFKFDYGVAVKEHCFACTAGAGSSCGGTLT,<
|
| 66 |
+
<ps1>,MSSKLRVGVAGPVGSGKTALVETLCIALKKRYKIAVVTNDIYTKEDANFLIKKKILEEGRIVGVETGGCPHTAIREDCSLNKNAVMDLENKYDPLDFIFVESGGDNLAASFSPELVDLSIYVIDVSAGDKIPRKGGPGITRSDLLLINKIDLADMVGANLNIMQNDTNMMRDGKPWFFTNLSSGSGVDNVIKYLVAQIPNI,<ps2>,MKLRLFEFYFIKDYLRPWFGLIYSLFFLFFLGAIGYRITEGWDWGDCLWMVLITITTIGFGEVQTLSPEGRIITVLIIVGGLIFIQFTFQKAVRLFESGYFQRVNELRFKRLLRKMENHVILCGYGRVGQEISNQIKTQNIPIIVVESDEDRKKIAEDNGLEVLCADATLDETLKLAGLDKCKSLVVTLPNDAANLYVVLSAKGIRSSIRVIARAGTEEAASKLRLAGASIVVSPYIAAGRAMASMALRPIAIDFLDLLAGSECEIEEFELSNDISLFETAEKITLLELGIGKKSGAKILAIKEDEKLITNPGGDFLLQPGQVLIAFGSKEQLTTLNRLLGNLVVSVELLK,<
|
| 67 |
+
<ps1>,MSSKLRVGVAGPVGSGKTALVETLCIALKKRYKIAVVTNDIYTKEDANFLIKKKILEEGRIVGVETGGCPHTAIREDCSLNKNAVMDLENKYDPLDFIFVESGGDNLAASFSPELVDLSIYVIDVSAGDKIPRKGGPGITRSDLLLINKIDLADMVGANLNIMQNDTNMMRDGKPWFFTNLSSGSGVDNVIKYLVAQIPNI,<ps2>,MQKTKFSKINDQFNNLLFGFLSSSWKSKSINVISVLTGYFLFANFATKFISEGKNELIMVPIIILIIELIIRIRPPAGSSFFNLWSIIDKARIGATYAVILEAFKLGS,<
|
| 68 |
+
<ps1>,MSSKLRVGVAGPVGSGKTALVETLCIALKKRYKIAVVTNDIYTKEDANFLIKKKILEEGRIVGVETGGCPHTAIREDCSLNKNAVMDLENKYDPLDFIFVESGGDNLAASFSPELVDLSIYVIDVSAGDKIPRKGGPGITRSDLLLINKIDLADMVGANLNIMQNDTNMMRDGKPWFFTNLSSGSGVDNVIKYLVAQIPNI,<ps2>,MSKLSTTKICVKSPAKINLHLEIIGKRKDGYHELAMIMQNIDLSDYIEFENNQIGEIKLKSNSKDLSLDEDNLIIKAANYIKDMSKNKELGANIFLKKNIPIGAGLAGGSSNAAATLVGLNKLWDLDLDYETIFILSAKLGSDVPFFIEGGCQFCFGRGEILEKYSSNFDFGVILLKNPNISISTVDTYKKYSQEFCPKYFTETEKTNKIRNDLRVNGFNDFKLSEQRINVKNDLQVIVERENNSVKKALYLLSNLQNCLSYSMSGSGPTCFALFKDINIANEVFEQNYKMFNNNGFEAWVCKLINSGITLL,<
|
| 69 |
+
<ps1>,LSRLLISFIFFAIVFLSPLSTFASHTSDPTVSLLQSRISKNFSKKFCNAIQNGLSKDEAMTSAIVKTENIVSFSYNPQKKWIEKEDLANQISIKVINDCGWSFGLIGKEGIDYFNSYFLEIYDKTTPDKKLSS,<ps2>,LSRLLISFIFFAIVFLSPLSTFASHTSDPTVSLLQSRISKNFSKKFCNAIQNGLSKDEAMTSAIVKTENIVSFSYNPQKKWIEKEDLANQISIKVINDCGWSFGLIGKEGIDYFNSYFLEIYDKTTPDKKLSS,<
|
| 70 |
+
<ps1>,MTFEAKYLGSNGWLIKFDKTNLIIDPWLTGDLIFPPGEWFFKGSLDNEILIEEDINIILLTQGLPDHCHVPSLKKFKKDIDIICSNSAKGILEKLGFTSIKVLKPKEKIMQKELEIEATAGAPVPQIENGYIVKDYKGKGFYIEPHGYLDENVNSQELDAVITPIINLELPLVGSFVKGADVLPKLIKTFNPKYILSSTAGGEAKYTGLLNKFISVQEYAEEVKCNLVNLKTMDSVKI,<ps2>,MTFEAKYLGSNGWLIKFDKTNLIIDPWLTGDLIFPPGEWFFKGSLDNEILIEEDINIILLTQGLPDHCHVPSLKKFKKDIDIICSNSAKGILEKLGFTSIKVLKPKEKIMQKELEIEATAGAPVPQIENGYIVKDYKGKGFYIEPHGYLDENVNSQELDAVITPIINLELPLVGSFVKGADVLPKLIKTFNPKYILSSTAGGEAKYTGLLNKFISVQEYAEEVKCNLVNLKTMDSVKI,<
|
| 71 |
+
<ps1>,MPIVFAWSLCLSVVVVLLSTIPLTLGRIKAGYSVENMSAPRALFDKLPDFGKRAVWCHQNCWESISIHAPACILCLITLPDSNLSLIAAWMHPLLRFLYIGAYVLNIPIARGLIWASGIFTTLVLYKEGISQFM,<ps2>,MPIVFAWSLCLSVVVVLLSTIPLTLGRIKAGYSVENMSAPRALFDKLPDFGKRAVWCHQNCWESISIHAPACILCLITLPDSNLSLIAAWMHPLLRFLYIGAYVLNIPIARGLIWASGIFTTLVLYKEGISQFM,<
|
| 72 |
+
<ps1>,MTDIEEIKKKIYQIAAITDRGQRLNKLIAPMYQEKLKEMGNLIDILESFNTEVSEEKLSGEWELIYSTVELFRSSPFFLAIEKALNDEFKSNLFFKLHQLQVGSFGLSTIGRIAQNIDFDKKEFLSTFDTTIFGLTIIPILGWFKLLPTFGGRVITLADDLILEDKVLKMNLKKTKVSKVDGLNKIPLFSTLLMERWYPVKEVWEKLPWNKESPSCEVSVIYLDEEVRVMKDIYGSTFVYIRPTISLLNSK,<ps2>,MTDIEEIKKKIYQIAAITDRGQRLNKLIAPMYQEKLKEMGNLIDILESFNTEVSEEKLSGEWELIYSTVELFRSSPFFLAIEKALNDEFKSNLFFKLHQLQVGSFGLSTIGRIAQNIDFDKKEFLSTFDTTIFGLTIIPILGWFKLLPTFGGRVITLADDLILEDKVLKMNLKKTKVSKVDGLNKIPLFSTLLMERWYPVKEVWEKLPWNKESPSCEVSVIYLDEEVRVMKDIYGSTFVYIRPTISLLNSK,<
|
| 73 |
+
<ps1>,MLLSRVAESLYWINRYLERAENISRFVEVSEAMSLDCPPGSAEPWLPLIDASSDRETFDSRFPEKKQDDVINFLIRDRINPNSIISCIQLARENARQIRDVMTSEMWEQINILYWNLQEGESIWDLPRQEQLSEIRRGCQLFYGITDATLSKDLACQFSILGRLIERADKTSRILDVKYYLLLPSLDELGGVLDELQWIALLRSAGAYQMFRKAEQNSIQPNSVARFLLLDNNFPRSVRYCLDGISNTLKMIDTSPSSDNPSKLECMRGLLKAKWSYIRIEDIINDGLHEAIDSLQIDLNKLHNLIEDKYFINKEFDQ,<ps2>,LVKPKSPDNKISNHLQQDVVKIAGKTIFINPFLYWRRFDENTNRWLREPGQMSEEQIQPNRNRFYPEIDWADLSQNQKLVKDASVEMFLKTLELISTFHPQLNSGQLLEVERKMAITKKLPFEKWVTKSFAKKARAEEYEKRKFKRDRFIRSWKEWLSLENTQQALLPIIVVVFVSAFIGWSSGVSKNSCNPYFEQNLDQSI,<
|
| 74 |
+
<ps1>,MLLSRVAESLYWINRYLERAENISRFVEVSEAMSLDCPPGSAEPWLPLIDASSDRETFDSRFPEKKQDDVINFLIRDRINPNSIISCIQLARENARQIRDVMTSEMWEQINILYWNLQEGESIWDLPRQEQLSEIRRGCQLFYGITDATLSKDLACQFSILGRLIERADKTSRILDVKYYLLLPSLDELGGVLDELQWIALLRSAGAYQMFRKAEQNSIQPNSVARFLLLDNNFPRSVRYCLDGISNTLKMIDTSPSSDNPSKLECMRGLLKAKWSYIRIEDIINDGLHEAIDSLQIDLNKLHNLIEDKYFINKEFDQ,<ps2>,MKLRLFEFYFIKDYLRPWFGLIYSLFFLFFLGAIGYRITEGWDWGDCLWMVLITITTIGFGEVQTLSPEGRIITVLIIVGGLIFIQFTFQKAVRLFESGYFQRVNELRFKRLLRKMENHVILCGYGRVGQEISNQIKTQNIPIIVVESDEDRKKIAEDNGLEVLCADATLDETLKLAGLDKCKSLVVTLPNDAANLYVVLSAKGIRSSIRVIARAGTEEAASKLRLAGASIVVSPYIAAGRAMASMALRPIAIDFLDLLAGSECEIEEFELSNDISLFETAEKITLLELGIGKKSGAKILAIKEDEKLITNPGGDFLLQPGQVLIAFGSKEQLTTLNRLLGNLVVSVELLK,<
|
| 75 |
+
<ps1>,MLLSRVAESLYWINRYLERAENISRFVEVSEAMSLDCPPGSAEPWLPLIDASSDRETFDSRFPEKKQDDVINFLIRDRINPNSIISCIQLARENARQIRDVMTSEMWEQINILYWNLQEGESIWDLPRQEQLSEIRRGCQLFYGITDATLSKDLACQFSILGRLIERADKTSRILDVKYYLLLPSLDELGGVLDELQWIALLRSAGAYQMFRKAEQNSIQPNSVARFLLLDNNFPRSVRYCLDGISNTLKMIDTSPSSDNPSKLECMRGLLKAKWSYIRIEDIINDGLHEAIDSLQIDLNKLHNLIEDKYFINKEFDQ,<ps2>,MSKLSTTKICVKSPAKINLHLEIIGKRKDGYHELAMIMQNIDLSDYIEFENNQIGEIKLKSNSKDLSLDEDNLIIKAANYIKDMSKNKELGANIFLKKNIPIGAGLAGGSSNAAATLVGLNKLWDLDLDYETIFILSAKLGSDVPFFIEGGCQFCFGRGEILEKYSSNFDFGVILLKNPNISISTVDTYKKYSQEFCPKYFTETEKTNKIRNDLRVNGFNDFKLSEQRINVKNDLQVIVERENNSVKKALYLLSNLQNCLSYSMSGSGPTCFALFKDINIANEVFEQNYKMFNNNGFEAWVCKLINSGITLL,<
|
| 76 |
+
<ps1>,MTEVINNIPDFEKYLTDTKKVVEEALDFSLGPENPEILRESMRYSLLAGGKRIRPILCLASCSLAGGEPSLAVPTAVAIEMIHTMSLIHDDLPAMDNDGFRRGRPTNHKVYGDAIAILAGDALLTRAFEMVSLRSPGVDSNRLLNVVGELSLVAGAPGLVGGQVVDLECEGKEVDLETLEYIHLHKTGALLKASVRTGAMIAGANEELLNALTTYAEGIGLAFQIIDDILDLTSSSEKLGKTAGKDLLADKTTYPKLLGMEESKKKAFDLVDQAKKAIEPWGLNAKYLISLADFITNRDR,<ps2>,MTEVINNIPDFEKYLTDTKKVVEEALDFSLGPENPEILRESMRYSLLAGGKRIRPILCLASCSLAGGEPSLAVPTAVAIEMIHTMSLIHDDLPAMDNDGFRRGRPTNHKVYGDAIAILAGDALLTRAFEMVSLRSPGVDSNRLLNVVGELSLVAGAPGLVGGQVVDLECEGKEVDLETLEYIHLHKTGALLKASVRTGAMIAGANEELLNALTTYAEGIGLAFQIIDDILDLTSSSEKLGKTAGKDLLADKTTYPKLLGMEESKKKAFDLVDQAKKAIEPWGLNAKYLISLADFITNRDR,<
|
| 77 |
+
<ps1>,VNFWGFINLKFLLDVLFALGFGLLLFSRVKEQRTLWLLRGYLLLVSFAWFIQRYAYLPLTSKLIDAVVLACSLSLAILWQGELRRLMELLGTGRLAVLLGNPPKEFRATSTTVNQLVDAAGKLSQNRKGALIVVDLGSDLRPEDFLYSGIKIEAKLSTDLLINLFATDTPLHDGAVLVKGNKIISAGVILPLSRQGISRYGTRHLAALGITERFDRCICIVVSEETGTLSLANQGKLERPITSSRLQELLIKLVGNQNTSGTPKSSSNKTNSYQKTNTNDTITVEKKLDKQNTIQD,<ps2>,VNFWGFINLKFLLDVLFALGFGLLLFSRVKEQRTLWLLRGYLLLVSFAWFIQRYAYLPLTSKLIDAVVLACSLSLAILWQGELRRLMELLGTGRLAVLLGNPPKEFRATSTTVNQLVDAAGKLSQNRKGALIVVDLGSDLRPEDFLYSGIKIEAKLSTDLLINLFATDTPLHDGAVLVKGNKIISAGVILPLSRQGISRYGTRHLAALGITERFDRCICIVVSEETGTLSLANQGKLERPITSSRLQELLIKLVGNQNTSGTPKSSSNKTNSYQKTNTNDTITVEKKLDKQNTIQD,<
|
| 78 |
+
<ps1>,MENPTKNKIQNLIDLNPVMVFMKGTKLMPQCGFSNNVVQILNSLGVTFNTFDVLSDFEIREGIKEYSEWPTIPQVYLKGEFLGGSDILIEMYNAGTLKEKIEIALAS,<ps2>,VNSNSSNQVGKNIRRTGFLIVLSYLLIVLIMKVLEANNFFGYSLSSFSNDIFAPPSLKHLCGTDRLGRDVCLRTLQGSSIAIEVVFLAIFFALILGLPLGLLSGYFGGILDKCLSLVMDTIFSIPVILLAVVVAFVLGKGIINASIALCIVYSPQYFRLIRNQTMLIKSETYVEAARVSGADVKTIIFKYILPNVITPLPILLTLNAADAVLVLGSLGFLGLGVPANVPEWGSDLNLALAAIPTGIWWTALFPGLAMFFLVLGLSFIGEELENIFEN,<
|
| 79 |
+
<ps1>,MENPTKNKIQNLIDLNPVMVFMKGTKLMPQCGFSNNVVQILNSLGVTFNTFDVLSDFEIREGIKEYSEWPTIPQVYLKGEFLGGSDILIEMYNAGTLKEKIEIALAS,<ps2>,MVMNVSIVIPTYNRKPILEKCLKALEKQNLNENISNYEVIVVDDGSTDGTTYWIKDNYEVLPHVVLYEQEHGGPALGRNLGVMKSKYEIIIFIDSDLIVLDDFIACHVNKLLFSWSKNTKKCFTYGSVINTSNFSNPESERYKLTDFSFAYFATGNVAISKELLLSVGLFDNSFSLYGWEDLELGERLKKLGTKLIKCPEAVGFHWHPPFDCGQIESLVSQEKERARMALIFYKKHSNLRVRFMIQLTPIHILLWQIICLGGLISIKRLLPLLRFLIDSGRNRIALEIVRIPLNLIYVKELRRLI,<
|
| 80 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MKVNKKYKGLVTKKFNEFYLVELDKYETSVANKKFLCKIKKSVNFRNQFVFVGDEVIVYQIDLQSKRATIESLVKRNNLLERPSVANISNIYVICSVEEPKLNLSQVNKFLISSEQLGVEVSLVLTKCDLITEEKRLLLIEKFHQWGYQAITLNLNNPENLRTLLIELKKKKCSIFMGPSGVGKTTLLNMIIPNLDNKTAPVSSKIKRGKNTTRNVELFSLSSKSYIVDTPGFNIQTLEIDIRELSNLYPEIYKQVVNEGIHCKFRNCLHVNDEGCKLNKNFERYTFYKEMVESSKSHYCLIQED,<
|
| 81 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MAGFGLPNFGQLTEAFKKAKEIQQNAQKLQDELESMEIEGKSDDEMIKVWISGNQLPLRVEVNENISTANKEEIEKNILEAIKKAHESSTTTMKERMNDLTGGLNLNLPGLDNNDS,<
|
| 82 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MASQISYRGNKNPIKKKLSFFEGGHQLEKLEFALAVAQTKGDEQKSLVLMKKIIELGGNVEEPGT,<
|
| 83 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MITLYQFRHSAFCLKTRMALHAKKLQYRVEEVTPGLGQFEIFKISGQKQVPIIVDDNDQIISDSTIICEYINKKNDNNPLFPKDPLLFAQCKLIEDWADTTMASTCRKALIKSAIENPQLRTALLPDEIPSSVKGLVDKLPFKNLSKISNVVFSTKDNLELQKILEALSKALINKKYLIGDNLSIADIAISAQLSLLKFPKSSGPILSGEGCQEYINNPYLENIFIWRNNIEEYLFSANSQ,<
|
| 84 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,VLIIYRSNSLTAKEASIFCNKTLKERNIKSKRIESDFDNNQLENYFYNLAALPDLVIVLGGDGTVLKSANALVNYDIPILSFNIGGNLGFLTQEKDFLFDQSFIKILEKEEFIIDFRNRLHCDVYSNEKNRERKILKSYDALNDFYFKSVEEDISPTNQIQIEIDNEKVNEYKGDGLIISSSTGSTAYSMAAGGPIVHPSINAFVINPICPMSLASRPIIIPDTSKVVIRVVQKNKREIKLWKDGSKCMTIKENDYCEINKVTKPCKMIKFNKSISYYITLIKKLDWKGDLSLKNNQNN,<
|
| 85 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MSKKRKRISRRRLAGQRVMAHVPIYHIETGKHKPVTAARRFIAENALSAPSVFNVRRNEHTTDRFFWGQKGLFSAQYAEENHFLFPSLKVVVEGIGEEKIFEGLELTADDWEEIEEYEYAFV,<
|
| 86 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MSKIKENKEQFWLEKFDCFSVTGKDSKRFLNGITTGNIVDLNNKVLKSCWLSPNGILKSLLEINCSEKELKVIVLVGNTSEIRKYFNDIIFPSDDVSLSDSFSINRLQQVDDMNSWRITQPIFLKNEDKKYDFYKNNPNSMNTNDLQLWKINQAIPSLNSEINGKNNPLELGLTDLIDFNKGCYLGQETMSKIKNVSSLKQEIRVWTAKDKDVNLESVNKILFNNQNKEKSVGYITSIYVLESRIIKGLAMIKRKYLDKGNPFFSDNFGQISLEKSVGSTFL,<
|
| 87 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,VENSINISILIPLIPMGMALLILSLLVSFNRTINRLTKPVSALAVFSLLSSALISAFLYFKKIEGEIFLSDYLKLFGSTNLILHLNSLTEKIVIFFAVIIAIVIGVLFYKLPRRKGYVSLIIGISLISSSIMFAVFFLDFSFLI,<
|
| 88 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MKRKEDSKNNNYDSMSFTDHLEELRQRLLNSIYSILICIFFSFLIIKPLISFLEIPASDIHLLQLAPGEFLFVAIKVAGYSGIIVSIPYIFYQLILFISPGLTKKEKNLILPAVFGSGLLFFLGLIFSWWILVPAAINFFINFGADIVEPTWSIERYFDFVLLLMSSTAIAFQLPVLQFILGSLGIITTEKMLSNWKIVVISSAILSAVITPSTDPLTMSLLSISIIFLFFVGAGLTYISESLKSKTLSSSH,<
|
| 89 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MTINEKIISDKELKISDWELDFYSRPIIETNGKKRWELIISSSKSFKTEKIFLWNKVCPANEVNSIWLTKALNEALNDAEIEGWAKPLKIRFWRASMKSIIKKSIENIGIEALVSRRTYELFDRIEFLEREIYPLEQGYVRGVLAPTFTSNILNDPKPLPEAVRGDALTISEISIEELKLAKNWPIEFGDIFPIQSSIKNDNLVPGLRLFSKDRSLALAAWFSSLEPVKLLIKQNQLILEASEDDKWLVTDLQEKDAKVLNDKFTQSKKDSYGYQFISIQATPFIEKFAGFWILKDVELIS,<
|
| 90 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MESIFNNSFATLVAYVGIVSIYLLVIPLILFYWMNNRWNVMGKFERLIVYGLVFLFFPGLILFSPFLNLRLRGDSKG,<
|
| 91 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MTNKKRILSGVQPTGDLHIGNWLGAINNWVELQEKHETFLCVVDLHAITTEYDTKQLSKNTLSTAALYIACGINPKICSIFVQSQISAHSELCWILNCMTPINWMERMIQFKEKSIQQGNNVSIGLFDYPILMAADILLYDADYVPVGEDQKQHLELAKDIAQQRINAKFGKEENILKIPQPIIMKKGSKIMSLNDGSKKMSKSDINEGSRINLLDTPEIITKKIKRAKSDSYMGMEFNNPERPESRNLLMIYSLLSGKEVSELENDLSQTGWGTFKKIFTEQIIESLKPIQERYQVLINDPHELNKILIQGKEKAEVVANKTLSRVKSELGFFEIEK,<
|
| 92 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,LALIIPSNYHKISDVEKNHISWIEPDLAERQDIRPLRIGILNIMPLGKQYEFNLLHPLGLSPLQIEPVWIKLKTHSYKTWDLNHLNNLYTTWEEANDPEPLDGVIITGAPVEHLAFEEVKYWDEFVNITNEARNSCASTLGLCWAGFALAYLAGVNKTVFDKKLFGVFPLKSLAPGHPLMGTQDDEFICPQSRFAGLPDLEMEEAQKEGKLNLLAYGKDVGYTIFETKDQKQLMHLGHPEYTVHRIISEINRDKEKGDVPPPENFDINSSNTSWRSHRNLLFQQWLWFCYQQVSLS,<
|
| 93 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MHSKINYFLGIFLSIVILIFNEPSFAINNPNLLPEEKTPVIDLAKTLSPNQKKSLEENLNNLEKESGWKIKYLSQFESVPGIAIKDYWDLDETSLLVIADPRGGNLLNFNVGEAYFAFMPRLFWVELQTRFGNQYYVKDHGEDGAVLDAINSVKICLDRGGCQVVPGLPKEQYIWTLCTSILGGLVAGFAAAPRKEGQIISIGFLALLSPLWGMLFGIFGLAPIISRTSEVLPLFKNGLAFAAAAIAGYLLSQTVFSRYEKPKKS,<
|
| 94 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,LTNYTHYTTVVVHLYYLLMTLGGANVWSNFSYGSRVDSPNGWILNPQGSFLILFENCKKSARNNINVYTHLLFTNHLGEPAGLKNTRLHDLDSAFETWNELIAGGWTEVTNQFQESA,<
|
| 95 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,LKEDSSYLIKYTSSGLYCELADTWIDPIKPVKRALITHAHMDHFTFGCDEYISTYETAVIIKERIGKEINIKTYDYEKEFKINGIKISFHPSGHILGSSQIKFSLAEEIWLITGDFKRQKDETCKEYEIVKTDYLISESTFGLPIFKWDEPQKTASDITKWVNSSQEKTSILFCYSLGKAQRLLNEISKTNFINNIYTHSSIYRMNNCYKKLGIDIIETTKLEQTKNNSDLKGSLIILPPALNKSSSLKNFKDIQTGFASGWMSIRALRKRSGYDKGFSISDHADWIAILKTIKESKAKNVFFHHGESEALNKYLKEKNSINVLEFEFKK,<
|
| 96 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,MNFKNHHQKKRFGQHWLVNNLILEKIKEVAELEEKDFILEIGPGRGALTSKLLDSKISRLHAIELDEDLIDLLNNKFRNDKNFSLQQGDILSTNLDSINKKITKVIANIPYNITGPILDIFVGRLGIISKNNYNKIIFLMQKDVVDRILAKDGNTNAGAMSVRMQLISNIRRICDVPPSSFDPPPKVFSTLVVFEPLRPEMRLDIKLEKYLDKLLRISFNSRRKMIRNTLNSILSAEEIEKLSESSQICFNSRPQDISINKWIKLAEACIKITNKNQ,<
|
| 97 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,LSRLLISFIFFAIVFLSPLSTFASHTSDPTVSLLQSRISKNFSKKFCNAIQNGLSKDEAMTSAIVKTENIVSFSYNPQKKWIEKEDLANQISIKVINDCGWSFGLIGKEGIDYFNSYFLEIYDKTTPDKKLSS,<
|
| 98 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,VNKKKLYLANPYGFSKQTKNLLPEFIKIFQNLNVEVYEPFERTKHLITNKNNWAYDLAKANFNDLKSCDCIFAIVNGNPPDEGVMVELGISIALNKEIFLFRDDFRNCSDSDQYPLNLMLFVGLSKESWSKNYFESIEDILNPKKSFLNWAKRI,<
|
| 99 |
+
<ps1>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<ps2>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<
|
| 100 |
+
<ps1>,MANSQVTTESGGRQNMFPSETRPYIDESVSYDSYPKNAEKVNGRWAMIGFVALLGAYVTTGQIIPGIF,<ps2>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<
|
| 101 |
+
<ps1>,MLNLIKKNLNIKSGIALIVLATIFVFLSNSFKKNKSKDISNFVVSVEKGILSESINTSGEVKATRTSNIGPRKQGILEEIKVEEGDLVEKGQILATLDDEDFIYKLEELELNLKKQKSEYLRREFLFKEGAVSKEDYESYKNKYNTSEAKFSDAKAEKDFYSIRAPYPGKITAKYAEIGSYVTPSSNLSSNSKAKNFIFELSEGLEIIAKVPESDIGRIKTGQEASVRIEAYPSNKYRAIVKKIAERAVKDNNVTSFEVTLKFKEISEEIKIGMTADLEFKVKSSEEKILVPTVSIVTEKGEKGVLKVDKNNTPKFEKIEIGISSGNKTSIIEGLRPGEQIFIDIPPWANKRK,<ps2>,VRIFMKLFKSLLVAPATIGLLAPFSTFAGEANLNDISKYSNLEHLDLANAFVNDEPKNNSLLAGGEGLVDSGSSDGGFSQTTTASFSVDAVLGAIDGNASATTGQGEETGFDFQFNIGLSTSFTGEDSLDIAIDNGSATASPIGAKMGFDTGTSLVVDGVTYSFPVGGATMVVGDATDVSATYTGACTYSAFTDTTLDDCGTGNSIGAGGKGVAASLGYAFDSGFSIAGGISSPTTEIVGDDADLYGLNVAYSTDSYGVAVGYAIDDGGTGAETTTWGLNGFYTFDLASLSVGYETSETGGTDSSGYFVGLSFSEVGPGSVNVGAATTGLFADSVTEYLIYEASYSYPVNDAMTITPGIFIEETAGDDLTGVAVKTSFSF,<
|
MED4_100_RRS.csv
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<ps1>,LSKRNPIILIHGLWNTADIFSSITSKLDEIGIEYFSPTLKHEYGMTSIVELTNLLNYLILEKYGYEKELDILGFSMGGIIGRYWIKKLNGYKRTRRFITIGSPHNGTLSSQLIPKYPFKGISEMKINSPLLRELSRSDYLLSGIDCISFFTYWDLMVFPGWRACLNSGEKISLKIYKHKNLVRNPDAVDKIIEKLLN,<ps2>,VRNSPFLPNRPLKVAVLGSSGAVGSELLKILEERDFPISELVLLSSQRSEGKIVKWKGEEIITKKASKEEFLNVDLVLASAGGSISKQWLSTVKDQNAVLIDNSSAFRLENDVPLVVPEVNACEALKHNGVIANPNCTTILLTLVLAPLNKISPIKRVIVSTYQSVSGAGQLAMEELQFLTKKYLQGDPKESEVLPYSLAFNLFLHNSPMLSNNYCEEEMKMTNETRKILNITDLKLSSTCVRVPVLRAHSESVNVEFDDVIKPSYAINQLKKAKGLEIIEDYEKNRFPMPNDVMGRDNIAVGRIRTDISNSNGLELWLCGDQIRKGAALNAVQIAELLIAKK,<
|
| 2 |
+
<ps1>,LTNYTHYTTVVVHLYYLLMTLGGANVWSNFSYGSRVDSPNGWILNPQGSFLILFENCKKSARNNINVYTHLLFTNHLGEPAGLKNTRLHDLDSAFETWNELIAGGWTEVTNQFQESA,<ps2>,MLRKLIHPILILPFCLYINSQEALLSKANNSIEEILHENENQIFLNYSDIDNITLKNNRELKALESLVNSTMFTLSSKIAKRYPSLDLQASGLPKYTSGKNYNSSSSTTKTSQFSANPSLNIKLDLIDPLRGSEIKIARNNYAIAKNNYEIKKKDLIKEAKSRYHKLQKSYQDIKNKTLSLDLSITSLKDAQSKFDAGIGTKFEVLEAEAQLSRDMQSLNEKKIQNQINKIELKEILNINGDFEINQKQKLIGFWNHKLNKNITEGLANSLSLKNINLKKSIKENQAKNYLNVYKPNVYISNNFTSSFSKGDSLSVKIDPEKSGSSYTNTVSLNFSWNIFDGGQNKNLYKSSKADVKSEDYSYKNIENVLKTNISKAYLNLKLNEEKILSSLKEISSTEESLRLARLRYDIGISTLKDVLVRQKELSNANSKKIDSIYNYNLNLDELVRLTFLEISNICNEENNLIKNEIQSICNI,<
|
| 3 |
+
<ps1>,MNQFFSRRSFILIPIMSILKFILQPKKVLAAFAASDDDWNLSKEDWKNKLSPESYYILREEGTERAFSSQLNNEKRKGIFYCAGCNQPLFTSDTKFDSGTGWPSFWDPIQGSVETKVDFKLIVPRTEYHCSRCGGHQGHVFNDGPLPTGKRYCNNGLALKFIAE,<ps2>,MTNTKVSNNNPDKESIINKSITKAKDNEIIKNKTIQNKKVNSVSKEPNKSVDDISNELFSELISKKISLVQEIKDLETKKNELEKDIESNFKGQSDNIAKRVKGFQEYLTGSLQNLSQNVEKLELVSQPIVVKPSPLDEKKEASNKNELLTVPALSETFKPDEQLIRSCFSNFIEQPDFYSEPWKLRRSLDSSDIEVMDDWFFNMGGRGSLESRGSRQKNALLSAGFIAILGELYGDQFQTLILASQPERLGEWRRVLQDSLGLTRDDFGPNSGIVLFERPEGVIERADRLEANEELPFIIVDAAETSVEIPILQFPLWLAFAGSNDEIYDDLELN,<
|
| 4 |
+
<ps1>,MVKEDPVRLELSITPSYGKNPVIVGIVESLDLVARRDREGRMPRDLQGTWDWTVRHGKVSTGGWNPMLKEALQTMFETGLPSIIYEELTGDEYKPVDGIRHVR,<ps2>,MSEYRDSSSNNFLSLISGAFIGAAGLAWWLISEADKRKEEKKQKAMMYSSRIQDGSEAIDTNENIKDVEGDKLEQKVEELNSAIADVRRQLEELGQ,<
|
| 5 |
+
<ps1>,MSLTQSKEVNSLSRRYSTYIERRITRTVMVGDIAIGSDYPVRVQSMINEDTMDVDNSYLAIKRLHEVGCEIVRLTVPSLAHAKAVGDIKEKLIKNNIDTPLVADVHHNGMKIAMEVAKHVDKVRINPGLFVFEKSDPTRTEYTDTEFETIKKTILKRFTPLVEVLKSENKALRIGVNHGSLSERMLFTYGDTPLGMTESAMEFVKICDELDFHNIIISMKASRAPVMLAAYRMIADRLDAEGYNYPLHLGVTEAGDGDYGRIKSTAGIGTLLAEGLGDTIRVSLTEAPEKEIPVCYSILQSLGLRKTMVEYISCPSCGRTLFNLEEVVDKVRKATSHLTGLDIAIMGCIVNGPGEMADADYGYVGKGKGTIALYRRKEEIKRVPEDEGVDALIRLIKDDGKWIDP,<ps2>,MSKVELISLTPEAEKTMAYIARVSNPSNQANDKFAGLLRYCIKHEHWSVFEQSCMTLKIETNRGIAAQILRHRSFTFQEFSQRYAETSLLGNEIPIPNLRRQDQKNRQNSIDDIPDELKIKFSEKISKHFQEANKLYEEMLNEGIAKECARFIMPLATPTRIYMTGSCRSWIHYIQLRSKEGTQKEHMEIAEDCKKVFIKYFPSVSEALNWE,<
|
| 6 |
+
<ps1>,MKKLFLLSLLISLISPIKTSAGFPEGEKGYDLKKIEDSFKLPCDEIGNDECIARAFGVGACTWVFGIKNGKDSKEALRIADGVLIALLKGNNLDINSIFEKDGSIKETIQKESVYRINFCKDATKLAIPKLIKKLPEGVELDDERIENLADVFPLQYLTMFEQMRKRN,<ps2>,MKAAILVNQKKKLIVDELDLPTSLKVGQVLVKLEYSGICGTQIGEIDGVKGEDKFLPHLLGHEGSGIVEKVGPGVKTVREGDSVVLHWRQGNGIQSEPPKYNWNGKTVNAGWVTTFNTKAIISENRCTKIPANISKEDAALFGCAVTTGFGVIENNAKLKMGESIVVFGAGGIGLNIIQAARLTSAWPIIAVDLFDNRLDLAKKLGATHSVNSSNKSYLDEIENILKDRELDVFIDNTGNTSIIEMGYNLISDQGRLILVGVPKTGENINIFSLPLHFGKKITGSFGGECNPAKDIPRFIKMMQNGLWDLKGLITESYDLENINEAIFSMRTGKTSGRVIIKL,<
|
| 7 |
+
<ps1>,MNNVVQNKSKIFYQLQKLRRLAQPFFLPIDQCNGFQFIWLLISLLFCVGGIVLVALTGIISFFESIQPIFLDKYFGGVVNTVNTIWSGWWGLLFSGLFLIGSGSFFSLRRQLKNRRWVHWLFLAVIVLMLLAVNGINAGIGFIARDLTNALVEKQQDGFYRILGIYACCFAVALPIRVSQIFFTYKLGIIWRDWLSKSLVKDYMTNKAYYQLNPNDEEQTDVDNPDQRITDDTRAFTGQSLSFTLGVFDALLTFSLNILILWSISTTLTFSLFGYAAFATAILLIAGKNLVKIDFDQLRYEADFRYGLVHIRDNAESIAFYSGEKPEKSETERRLGEVVRNFNLLIIWRVIIDVMRRSINYAGNFFPYLIMAIPYFRGDIDYGRFIQASFAFGMVEGSLFFIVNQIEELAKFTAGIGRLEGFQSKVESISQTKPIDNQNIISDYSSILINNADLFPPGSDKAIIKNLNLSIETNQSLLVVGPSGCGKTSLLRMISGLWEPNQGSIKKPKTGDLLFIPQKPYMLLGSLREQLCYPTEVDKFSDDHLISVLNEVNLNSIVDRYPNLDVKQDWPRILSLGEQQRLAFARLLLNSPRFAVLDEATSALDIKTEKRLYNLLRDRELSLISVGHRPSLKDFHENILELNGQGGWKLFTTDKYNFKN,<ps2>,MNRWVLLEHKILSSKFIDIHYDFLVEDQLDCLTWKFHEIPSLNKGVIKIVKQPNHRLVWLSRVEYQLSKNRGLVKRIDHGIFSNIPHNQDSQKLKIILNGKLLNGLFIIDGNFCQLTKNN,<
|
| 8 |
+
<ps1>,MKYLILGSGSFAGQLIFSEYLERNYDVYGFNRSRVKDHYQWPWIKKYKNDLGNRWFEYNLTNDVEEMISHINRLKPNFIIDFMGQGMVAPSWLKPEVWYTTNIAIKSRLMNALIDSSFLQKYIRIGTPEVFGSNENFLKEDECFNPSTPYAVSHAAIDFNLRCLYKQYNFPYLIGRFANFYGVGQQLYRIIPRLFLSCRSERNFILDGKGESRRSFIFSKDIVSAIDSMIKFDGIGQEFNFSSNEEISIMSLVNKICNLTNVDKSRILKFGPERPGKDRYYRLDIKKSKNVLNWEPEVSLDEGLNIINIWISENIENLSNKSWTYEYKD,<ps2>,LSLIFINLLTSIPEYISKAVETNSTIAYLTICLAMFLENIIPPIPSEIIMPLGGFFVYQQKLNFYILVFWGVFGTILGSMPWYYLGKLVNEKRLSNFLDKRGKYIGITSNDLIKSRRWFDKYGVSLVFWGRLVPGIRTLISVPAGMELMPLRKFLIWTSLGSLIWVTLLTYAGFVFGENYPIIETYLNQIKFIVKPILILIFVYFLIKFFIRLYKKKIT,<
|
| 9 |
+
<ps1>,MGENLPLLLSAALGKKVNRPPVWMMRQAGRYMKIYRDLRERYPSFRERSENPELSYEISMQPFLAFKPDGVILFSDILTPLPGMGINFEIIESKGPIIEDPIRNIRQVEKLKELIPNESLSFVGEVLSSLKKDVKNEATVLGFVGAPWTLAAYVVEGKSSKNYSLIKSMAFKEPDLLHKLLDHFAKSIGEYLKYQIKSGAQVVQIFDSWAGQLSPQDYDIFAGPYQKKVVDIVKEEFPDTPIILYISGSAGVLERMAKTGVDIISLDWTVDIEEACKRIPTGIGIQGNVDPGILFGNKDSIKERIDNTFNKVKERKYILNLGHGILPGTPEENAKTFFEHGKKLTY,<ps2>,MRGSIKRSNESYQDSYSPNGIIGEKDACGVGFIANIDGKESNWILKQSLKGLNCMEHRGGCGGDSDSGDGAGILCSIPWEFLDRELNLNTESYEKRGLGMIFMPNNELKVKESKLICDEEAKELNFKQSFWRNVPIKNETLGILAKANAPFINQWIVCLEKDDSRDIEMLLFQLRKRIEKRIRDNTKNAIGECEFYFASLSSKTVVYKGMVRSEVLSEFYEDLKKEDFKVSFSVYHRRFSTNTLPKWPLAQPMRFLGHNGEINTLLGNINWAKASEIHIDDYWGELSRDIKPIVDKNKSDSSNLDATLEINIRSGKPITDSLLKLVPEAFRDQPELESREDIKAFYEYSATLQEAWDGPALLVFADGNYVGATLDRNGLRPARYSITNDGFVIMGSETGVVDIEENRVIEKGRLGPGQMLAVDLSQNKILRNWEVKAEAAKRKNYKKLIQKRTIKLKNNEWSNTCNLKDFELLQQQTAFGFSSEDNDLILDSMASLSKEPTYCMGDDIPLAVLSSKPHILYDYFKQRFAQVTNPPIDPLREKLVMSLEMHLGERCSPFEFNGIKPFIHLKSPIINEKELISLKESEIKSKTISSLFDIEERIKGFEAKLDDICKVSEKAIKEGCSLIIISDKGVSSKQSFIPPLLAVGAIHHYLLKKEIRLKASLIIETGQCWSTHHLACLIGYGVSAVCPWLTLESGRHWLQHPKTQKLIATKKINPLSIDDVQENIKKALEDGLRKILSKIGISLLSSYHGAQIFEAVGLGSDLIKIAFDGTTSRIAGITLKELANESLLIHTKAFPEIDLKKLEFLGFVQFRNNGEYHSNNPEMSKVLHSALKQGPGYDHFETYKTLIRNRPVTSLRDLLSINSTRKSIPIDEVESVESICKRFCTGGMSLGALSREAHEVLAVAMNRIGGKSNSGEGGEDPARFNVLNDIDENTQSAILPSIKGLENGDTACSAIKQIASGRFGVTPEYLRSGKQLEIKMAQGAKPGEGGQLPGPKVDSYIAKLRNSKPGVALISPPPHHDIYSIEDLAQLIHDLHQVHPRAKVSVKLVSEIGIGTIAAGVSKANADVIQISGHDGGTGASPLSSIKHAGLPWELGVAEVHKSLMENNLRGRVLLRTDGGLKTGWDVVIAAILGAEEFGFGSVAMIAEGCIMARVCHTNKCPVGVATQKEELRKRFKGLPENVVNFFLYIAEEIRQIMSSIGVSNMEELIGNQEFLTARDIKLPKTANIDLSSLIKKGTQYKDRSWLKHSKTAHTNGYVLEDQFLSDNEFMNSIKNHGKVIKEIEIKNTDRSVCAKISGEIAGLYGNNGFNGELNLNFKGYAGQSFGAFLLKGMHIQLIGEANDYVCKGMNGGVLTIVPPQVDEKSSEQVILGNTCLYGATGGKLFALGKSGERFAVRNSGATAVTEGSGDHCCEYMTGGKIVILGSTGRNIGAGMTGGIAYILDENNDLENKVNKEIVSIHKITSLKQEEILLGILGEYLEKTKSLKASKIINNWSNFKGIFKIVVPPSEEETLGI,<
|
| 10 |
+
<ps1>,MIEKKGDNIRSENFYPDSNYYLDQDNTPEETTLPEDQIFNTKKFEWPNSYWFIAERTNGRLAMIGFMAVIINYTLFGWIAYPIL,<ps2>,VHKNKILVPLSNNSYEVIIKQGLINNIGEELIRIGINSNRKILIVSNKEISTLFGRKLLNNLKKNNFNAEIFNIKAGESHKNFASLSEIFNAAFEVGLDRNSLLIALGGGIVGDVTGFAAATWLRGIEYIQIPTTLLSMVDSSVGGKTAVNHPKGKNLIGAFYQPKAVFIDPETLITLPTREFKAGMAEVIKYGVIKDKSLFEYLENEKNRDKILNLENESLIKIINKSIKTKACIVSEDEKENGIRAILNYGHSFGHVIENLCGYGEYLHGEAISIGMKIAGDIATEKNLWSKEHSLRQDHLIESYGLPIQTPKIKKNDVMKILMGDKKVRNGKMRFILPIELGEVDIFNDINESQFLKYFN,<
|
| 11 |
+
<ps1>,MLKNDLWINQKASKGMINPFQSNLVRHLDPNNKKNAVLSYGCSSYGYDLRLSSKEFLIFKHVPGTVMNPKKFNPDNLEKTILHEDKDGEFFILPAHSYGLGVALEKMKVPENITVICIGKSTYARLGIIVNTTPAEAGWEGHLTLEFSNSSGADCRIYANEGICQLLFFEGDPCSTTYEDRKGKYQNQPEKVTLAKI,<ps2>,MKKTKVICIGEALIDRIKNKSNQEFTDFLGGAPANVACALRKLQIDSVFIGRIGSDEFGKKFINQFKELEVNINFLQLDDCLPTRIVKVNRDNSGDRYFSGFDTSLNTFFADEAFDKNEIKKDLKSLENLFSKTKYLVCGTIILSSSISADTINFLLSLANKFDVKIIIDLNWREVFWDFATSSSETNKKERVDLIRNLLNKAHILKLAKEEAILFFENKNPLEISERLLNRPDVIITDGANPICWLINGVQGTTEVSKSLKIIDTTGAGDAFLAGLISQLLSFDYPSNESEIQNCVKFASICGLLTCLGEGAIEQQPDYSKVNKFFGSQIL,<
|
| 12 |
+
<ps1>,MVSVPFSNNGSNKNFKKDFNNENAGLVPPQNIQAEEAVLGGILLDPDAIGRIADLIKPEAFYINAHQEIYKTALMLHTQGKPTDLTSMSAWLADNGSLEKIGGNSKLVELVENVSSTASIEQVANLISDKFIRRQLIRSGNEVVQLGFDQTQETNEVLDKAEQKIFEISQEKPTKGLTQAAEILTSTFNEIESRSLGTSVAGIPVNFYDLDAMTQGFQRSDLIIVAGRPSMGKTSMVLNLAKNVAQSQDLPVCVFSLEMSKEQLTYRLLSMEVGIESGRLRTGRLQQEEWPLLGEGINSLGQLPIFIDDKPNLSVLEMRSLCRRLIAEQKKELGLIVIDYLQLMEGTTPDNRVQELSRITRGLKSMARELKVPVVALSQLSRGVESRTNKRPMLSDLRESGSIEQDADLVLMIYRDEYYNPETEDRGITEIIVTKHRNGPVGTVKLLFEPQFTRFRNLAN,<ps2>,VELMGQFFSNVARYPKYLISIIAGGLVALLEPLFKNRSNPLTLVGLISSVISAFITFYFVLKAMTNPINL,<
|
| 13 |
+
<ps1>,MSFSKLVEIKNNFKFDANNSCKNLYKGACVKIKNSQKTFQVVGINPQSKVCWIREWPFALEVNKTFSLELNQITLQTYCSDTFNEK,<ps2>,MAENFSFDVVSDFDRQELVNALDQVKREISQRYDLKGTDTSLDLEKDNIFITTNSELTLNSVIDIIRQKAIKRKLSIKIFDFNSIEVVSGNKVKQTITLKKGLNQEIAKKISKNIRDEIKKINVSINGETLRVMSKSKNDLQLAIKLLENLEETYKIPLQTNNYR,<
|
| 14 |
+
<ps1>,MNKRHSLQRKTTLKWNSNGDLSEIDMLRILDRISAYELNQCELTCDLDE,<ps2>,MASQDYLIAIALIEQNNIRAMPLGGKEIKEKLEEEGNLIKLGEEVILNLLLRVFQRSDEGALKRVSEDKGLLLVHMHPKRMQKELPFIKSEWIRDGDTTQFLKYLGNLSKEIWTASLIKYKGLELVSIAKNEDI,<
|
| 15 |
+
<ps1>,MVEKFKTLFFVKSSLISLYLALTCPIPFISSEKLKIFSIITFFFGLLLIINITNDYVDTCDKKISYKTSFISKIFGKKNWEIFWKDIKLIKSLPTSQGSNIHYFISNKNESFLVPQRVENFERFVSIIEEKTKLNIDKLSYISPLWTYKLLTYMSILMIIGELIAFII,<ps2>,MSSNKICLNCGSSDLVSDRSLGGRMVCFKCGSSSFKNNSFSRIQNKKIIYLLIVLVILLIVVL,<
|
| 16 |
+
<ps1>,LTNSIKGRNSKEVTIQLKRAETQKNILIKNIYKEYETYFDIVRKSMLISAKKGIAGIYSDFSISDKALHSKELNIFLNKNISLLINSKLPFITIEQLKLGDISYPTKQLVNASVLKELVKRKEYQTVHIDHENEKTANESIEFHCDNNLNTYEYYESLSEDEISSVNLDESCYLNSFSKEISIENIEEGKRLVNAFLELIEETSDNKLIDYEKINDQAPDVFISSDNLNTFEFIDKSFSNFLLNLSYNINLELFKIELIKKIITEETFKCLSNNNSIIKHPYPFVIRYDLYPDNLYPRKNKSSDVYLFNITNVELELYNLDLSICRNNINDLKNRFKLLNKKQRYWKNKELASNSSK,<ps2>,MTDILVLILFVLSGAASGWLGVDLLPIDILKQVSNVEGFRIVLAIIGFFIGLAAGFVFLQLRKTFLDQIRTMPTDLLISRAVGLILGLLVANLLLAPILLIPFPREVFFAKPLAAILSNFFFGALGYKLADTHGRTLLRLFNPTNTDAYLVNEGIIPAASPKILDTSVIIDGRINGLLSCGLLEGQLIVAQSVIDELQTLADSSSNEKRGKGRRGLKLLKELRELYGRRLVINPTKYEGNGVDEKLLRITEDMAGTLITADYNLSQIAEVKELKVMNLSDLVIALRPEVQPGESLNIKIVREGKEKLQGIGYLDDGTMVVIDDAKKFVGERLDIVITGALQSPTGRMVFGKLINNPESNKSFKSPATQG,<
|
| 17 |
+
<ps1>,VHLDKSSSEIINKFKLSPHPEGGWFREIIRSKNHVTRNDGQKRNNITSIYYLLCKSERSKWHRVNSSDEIWIYLQGAPLNLYFLDDNKELRNIRLDLNNPIEMIPSGYWQAASSTGEFTLTSCCVGPGFDFNDFQMLRNIDPSLRPAKAIKELI,<ps2>,MFFWYRVLSLWPLRRRIINLERFRGLHDDYSKSSTSLNAIRELNDTCNVNLLCTPYVAYIPNSDYWRPNQARDLYKLHLKKSSKKLKIKFIDGSTVIDTKDIKNYAPLGPHLSKLGYQKFAELLSSHLSKKK,<
|
| 18 |
+
<ps1>,MSKVEIYTWRFCPFCIRAKSLLEKKNITFTEHKIDGDDNARELMMERANGKRTVPQIFIDDKSIGGCDELYELEKEDKLDLLLN,<ps2>,LTIYLGFLYLFFGIIFLLMPLIYIELGRPRDFIKGGLNLVIGMLLIYKQNIFNTLNYLIFSVITTLLTFYIVEIFSIRWNQLTNQEKNNLLTLEELKKNLSIFLKAISLARQDFLNSNNIFKFGRKNENLNKKKWVRNDENDNIVNSNKNNLLTLEMPKKATNKSTKDTINEGK,<
|
| 19 |
+
<ps1>,MKLSLLSAVLFLFTEISFAQEKLNYTVTSDSQIQSIKGNFEAIGNVIIKSTNNNFEASSNKLTYDKDAKTLKLVGNVFVKNLESEGLSIQKSYGDELTIFTDSGLFKFNSENKNRVKTKLKF,<ps2>,MKNLKSNKRKIHRKVAAISSIPLLITLISGTIYSFLQPLGVDAFWLIKWHTGNFGIINLQPFYSIFLGIASIISVISGIRLLQKNS,<
|
| 20 |
+
<ps1>,MVCVSNNKSYLKSQHLKIIGQKTLRGKVKISGAKNSALVLLAASLLTDEKIILDNVPLLTDIEKMGNILKNLGVKLHNKDHQLIIDSKNISIQELPYELVNGLRASFFCIGALLTRFGEASIPLPGGCNIGERPINEHINGLRALGAEIIIDRDVVKAKLVKKKTKLFGANIRLNCPSVGATETLIMAASLAEGRTVIENAAREPEIQDLCQMLNKMGAKIYDSGKEKIIIDGVHKLHGCTHKVIPDRIEAGTFLIAAAATSSSITVSPVIPNHLEAVLNKLEESGSKIIIKGNSISIKGNNIKAVDIKTAPFPGFPTDLQAPFMALMTIAKGRSKITETIFENRMNHVDLLNQMGSSITLKNNIAHINGVKKLRGMTLVGSDLRSSAALIIAALTSKSVSYVYGLEHLDRGYENFEQKLSKLGIEIKRQITKQTINKSKNRSSNSKLKEVSEIRAA,<ps2>,LELVDNINPGLVNNLLKMKSKIKQTFKLILFIFLTNTHFLQAHNLFNGGCKNHCKESVKPLIMNKELNNSSYKNQIEDDDSCLIKSLCRG,<
|
| 21 |
+
<ps1>,LIFIMIQFASFAIGGFVPSAAIAGVLVLIGLGAFFYLGLKGPTDY,<ps2>,MYSLEISLRYSPFPLSIQKKEYEDIKRIYDEIKDSMNSDNQNSPLIELSCEKVQDKLITVLAKEVISVQIYEKSAVAGGSKRPGFSLDI,<
|
| 22 |
+
<ps1>,MSGIKTKNNTQKLSFRLAPYLFIAVAIFTAFGTNGGTWV,<ps2>,MRNSWIQPRIGQKNITQMNFAKNGHITEEMNYVAKKENLPPSLIMEEVARGRLIIPANVNHVNLEPMAIGIASKCKVNANIGASPNASDINEEVEKLKLAVKYGADTVMDLSTGGVNLDEVRQAIIKESSVPIGTVPVYQALESAHGSIERLTEDDFLHIIEKHCQQGVDYQTIHAGLLIEHLPKVKGRITGIVSRGGGILAQWMLHHFKQNPLYTRFDDICEIFKKYDCTFSLGDSLRPGCLHDASDDAQLAELKTLGELTRRAWTHNVQVMVEGPGHVPMDQIEFNVRKQMEECSEAPFYVLGPLVTDISPGYDHISSAIGAAMAGWYGTAMLCYVTPKEHLGLPNAEDVREGLIAYKIAAHAADIARHRAGARDRDDELSHARYNFDWNKQFELSLDPERAKQYHDETLPEEIFKKAEFCSMCGPNHCPMNSKISDETLDELNNKLTKCDTSV,<
|
| 23 |
+
<ps1>,MKSPVLKNPNQDWHPNIWPPFTQIINSKPQLEVTHGKNALIYTKNPKQELIDGISSWWVTLHGHSNDYIADAIYHQAKTLEQVIFADFLHPQAQILSERLSGLTKLERLFFSDNGSTAVEVALKIAYQSWQNQGETRNQIIAFDGAYHGDTFGAMALGERNIFNENFDNLMFPVKRAPWPSTWINDEEVERKENNAIQILTKLLKKPTVAVILEPLVQGAGGMNMVRPEFIRRVSEVVKNNNSLLIADEVLTGFGRCGSLFAFQKANIIPDLISISKGLTGGFLPMGITLAKETIFQSFISDSPKKTFWHGHSFTANPLGCAAANASLDLLEKDPIKYLSFEEKHLSHLKKIKKLPFVKNIRVTGTIAAFDIEIGKNEGYLNNVGKRIKALSIKKGLFIRPLGNVIYLLPPLCITDRQLEKSYRIIFEILSDL,<ps2>,MKSLLNTHTKFEIMHIVLRNFKFLIFLFLLSLNLSGYSNAHMRGTFLSEEDARNRSLELGCEGIHKNQDKWMPCKNEKELHKFLNKKGSSRGNNLASSLAWIFILSSSFGILWLSIVKIKRK,<
|
| 24 |
+
<ps1>,MNNAKNLKIKQIDKKNISFKELSLIKNIIFWVDIIPGDQTQKNAIFARPFHDKNAIPQKLTGDNFYIKSNFHGYGGKSYQCIEVNDHIYLIWVDQLSKAMWLKIFKVQEKVLKNDNQYLLCDVEPRQLTESIKTNFDTSFVISKNNLLFGLCEIKHRDYLFSLNLKKTKQDIRIIKKFDNFAGNLSSNISADLFSWIEWNAGSMPWERNELFFAMIDNDGEIQNIKNFSNKFVNEEKNVSFFQPYWMSDTTLVCSEDSTGWWNLLFLDLTDIKNIILKKRIIKPLTEYGSPQWVSGISFFSGNIKNLFCVAKKDNSWVLEHYQNCECIKELKLPFCSIGDLDVCDQKLVIRGCSFGCFEELFECDFGEKSHTKLLNEISLESINEYSRPESFWFKGFNNQPTHSFIYKPLFERFIKSPLIVKAHSGPTACFDGSLNSEVQHWTSKGFTVAEVNYGGSSGFGREYRERLNYKWGILDSYDCKALVLDLIRLNLVDRTKVAILGNSAGGLTAINALCEGDLFKVAICKYPVLDLNDMHQHTHRFEKGYLNSLIGRYSKFHNEYKLRSPIYKINHLKKPVLLFHGKKDLVISCKKTLQIKEKLLKNNKNSEVIIFENEGHGFKNTNNKKQVLIKTQEFLEKTLNI,<ps2>,LSRILLLSNGHGEDLSGSLLAKYFVKKGDLVDALPIVGDGENYKKENIRIIGKTKKFRTGGIGYNSFSGRIFEIFGGQIIYFFKKLYLSYKLKNKYDFYLVIGDIVPVFFAWFAKKDFFTYLVAYSSHYEGKLKLPWPCKFFLISKYAKKIYARDFLTADDLSQQLRKKVSFLGNPFMDKFSFFENKPKIVPFNIGLFPGSRFPELLDNLKLILEVLETMSKLQYFENIAFKFAIVKALSMEEIRQILNQRKWIYIEKKGKNDGLEFTFGFITINLNWNLFEEILFESNFVISMAGTASEQAIGLAKPVIQIEGNGPQFTKSFAEAQRRLLGRYVFCSTNYINKKDQINQTINLILKVIYLIKLDKKFLVSCLDNANLRIGESNSCLKIINDIKGFHEK,<
|
| 25 |
+
<ps1>,MMIPISLQKNQNRHIIKSVRKFIDRFFKIKKNQNLNMTEQRKIEKFGVGNLYPDIHPPEFSFFKEKCIDVALGYDDGFTFTPKFGNFKETEDIFDYLKQYLEDKELEKLDIRFDTLKTCIYQINPETLELGELLECEGSDVEYFEWNKKTKSIDEVDSNSLSDEEEEYFH,<ps2>,MIKIFALTFSEIGIGKLEIFVIGIVSLLFPILFIIASRNLDAKGVFDWMMEKPNDWIGKK,<
|
| 26 |
+
<ps1>,VFDISKENFFKNLIKFPKKNIFMILLFLGFGEWFLSDLINFAGGSIGFFILCFGGYFYLKSEKPKFNEPKDLDGWIKLCNEDLDFFEEIELHNNLEKQNINRKKALELILNREKKEEIYCIGQKNFDSNATLFKNYFKEDKFKLNFMERLPKYNSSEIVPEVILNSDAILYFLKLPLSANDFLWLEKLPKNMPIWLVASFTKGLSFNNEIEEVKAQISGEYANRIIKFDKTKNSFANIPFSLRKFFISSNNNIENTKKRLLKRLHTNWQSEIEGIRRMQLNDLQRRNQIIVATSVFFSPIPSIDVLSMTVLNSLMIKEIKSIWGCNWSPEILDKVSKQIIKTAIAQGVIEWSGQTLIGLTKLHGPNWLVTGAFQAISAAYLTRVVSSSLADFMALTKGVSEPDLEFIKENSDKIVERAFENEKINWKSLIPELNIPLTRLT,<ps2>,MIKNTKKSQKNKILTLEDVSISYGTFEAVRNVFCNFKSGDITSLIGPSGCGKSTVLRALNRMNDLIPNCSLRGTVLFDGTNIYDKRVDPVEVRRRIGMVFQQPNPFPKSIYENIAFGARINGFVGDMDELVESSLRKAAVWSECKDKLNDSGYSLSGGQQQRLCIARTIAIEPEIILMDEPCSALDPISTLKIEETMHELKKNYTIIIVTHNMQQALRVSDMTAFFNAVEYEEGDGGKVGYLAEFDSTKKIFSSPKEKTTQEYISGKFG,<
|
| 27 |
+
<ps1>,MWIRFHEKNGSERLNFTAFYEALLEAKGVNLGDTGVAGIGKGGRKLSYIATVQGNGNLLIGKAYTALLDLKAGDEFEIKLGRKQIRLLPSE,<ps2>,MTISDKIRVYELSRDLKLENKDILDAAQKLSISVKSHSSSISLEDAKKIKNLINKNSSKKILSVSKSAIKAKNENPKNNDNKNNKNFSNPSHPEKLSKEGLNKKPLLIKPTNKVVNSLVSSNIKNPNPPTIVSNLKSQALSKNQNKTNTSVITTPNLKDKKNPSALQDKKPLKNSSGSPAKTTARPPIQLIEKPKNLANSNRNINANKINNSVNQKAQSLNRADNNKLSRADNNNFPKKNLNSPNVKSTPELVGAPIRREDPKINTNRPNSNSRQPSSNTQISANRPGGQNRQGVPNREGGPYRQGSPNRPGTPYRQGAPNRPGGQNRQGVPNREGGGPYRQGSPNRPGTPNRPGTPYRQGAPNRPGGQNRQGVPNREGGGPYRQGSPNRPGTPYRQGASGIRKPVAPNELMQLQKTNASNKEKPNISNVNKQKIEGANQKTKAPNSRLNTSPSPTAKKPARSFASNTKKPGRTDWDDSAKLEALRNKNPQKQRQKVHIIGENDDSLTSETSGYSGEKVSILSASLARPKKEKSEEIKSQKPSKQFKKKKKETTRQRQKRRAMELRAAKDAKQVRPEMIIIPEDNLTVQELADKLSLESSEIIKSLFFKGITATVTQSLDLATIETVAEEFGVPVLQDDVQEAAKKTVDMIETDDIESLIKRPPVITVMGHVDHGKTSLLDSIRESRVASGEAGGITQHIGAYQVEFEHESKKKKLTFLDTPGHEAFTAMRARGTKVTDVAVLVVAADDGCRPQTLEAISHARAAKVPIVVAINKIDKEGASPDRVKQELSEKDLIAEDWGGDVVMVPVSAIKKQNIDKLLEMILLVSEVEDLQANPERLAKGTVIEAHLDKAKGPVATLLVQNGTLKAGDVLAAGSVLGKIRAMVDEHGNRIKEAGPSCPVEALGFSEVPTAGDEFEVYRDEKSARAIVGDRATDARATKLAQQMASRRVSLSSLSTQANDGELKELNLILKADVQGSVEAILGSLEQLPKNEVQVRVLLSAPGEITETDIDLAAASGSVIIGFNTSLASGAKRAADANDVDIREYEVIYKLLEDIQSAMEGLLEPDLVEESLGQAEVRATFAVGKGAIAGCYIQSGKLQRNCSLRVLRSDKVIFEGNLDSLKRSKDDVKEVNTGFECGVGCDKFSTWSEGDIISAFKFVTKKRTLNK,<
|
| 28 |
+
<ps1>,MNKTITPSIETIERNWFLVDAKDKTLGRLSTEIAAVLRGKNKPTFTPHLDTGDFVIVVNAEKVEVTGKKASQKLYRRHSGRPGGMKVEKFESLQERIPERIIEQAVKGMLPHNSLGRQQFKKLKVYKGSDHPHAAQNPVLLNS,<ps2>,MKIILLVFFFIMFSFIFLKFYKYKKAFKKDKSIKFNKSNLYNWMNLTKKERFDLSKKESNSYLKKRKTLLEEIRKEYKIISKND,<
|
| 29 |
+
<ps1>,MTELNQKNSGKNIKWHNLTIDRNKLEKMRGHKGMVIWFTGLSGSGKSTLANAVNEVLHLDGFSTYVLDGDNIRHGLCKDLGFSDEDREENIRRIGEVANLFMNAGIITITAFVSPFISDRDKVRKIIGSKDFIEVHCAADIEVCESRDTKGLYKKARLGEIKEFTGISSPYEAPVNPEIVVDTGSLGLNDSVEKVINHLREQNLLERS,<ps2>,MESDNLFSNTYRIESNAPLADKLRPKNLDDFFGQESILGHDSLLRNAILNDKVGNIIFSGPPGVGKTTLIEIISSNTRSSLIKLNAVLSSIKELRTEIANAKERLRSSNRKTILFIDEVHRFTSVQQDALLPSIENGTITFIGATTENPFFAVNKALISRARIFSLLPLNKNDLKKIIDKVIKYYSCLKDSKVVEIKEEAINHLIKFSGGDARNLINALELGISITKENKENLVVIDLAIAEDSIQKKNIVYDKNGQNHFDVISAFIKSIRGSDPDATLYWLANMVEAGEDPNFIFRRLLISACEDIGLADPNAIVVVQSCCDAFDRVGFPEGLFFLSQASLYLAISPKSNSTKSIFKALEAIKATNVSLVPNHLKNNASNYLNPHNYQGKWLQQEYLPTDLQGIKFWKPKDSGWEKNKYEDLPKKQKS,<
|
| 30 |
+
<ps1>,MSASKREEVSSHLRYIRLELREMHQMLIRDDLLPDLSEAKEVHAQLDALYELLSDKRKKKVKNEFENF,<ps2>,MEAFHPPKEVKETIDDSGLPKEEGISEKWLREKIDSLIPLIQEKWPNIAQQTLETAKGSIDDLVGVIASHTGSSASGIKNQLFQIIDSIQENNWEIADKIEPIESQLEELLDELNSTLRPKIETPIRKKPILSIAIAAGIGLFIGSLINSRNK,<
|
| 31 |
+
<ps1>,LNYWIQNLAPDGSPDEIGVIQLAWLGDSVWELHQRLRHIHIPLKSRDLHLSVVNEVKAQAQSKALDEIEHLLNSFEINLIRRARNKTKRFPKSSDPAIYSRATGFEALVGWLFLKDPKRLSKFFEYLECK,<ps2>,VTQIDSKKKFDRLRLCKLLETIYKEHTTEELNLICNQLLQILDNFSEKSRYEEISEDKKWDESFAVLITYADGVYKKGETTLVTLRELLSKNFGSLSKVVHILPFLKSTSDGGFAVSSHTSLEEKFGSWEDLKSISNKHYLMADLVLNHVSSSHPWVQQFIKCQEPGLSNVFSPSQDLDWKNVIRPRSSSLFSQINTDDGQKQVWTTFGPDQIDLNWLNPKMTIEFLNLIITYLSNGIKWLRLDAVGFIWKEPGTTCLHLSKAHSIVKILRILLNDLLKDGVLITETNVPQKENLSYLLPEDEADMAYNFPLPPLLLEAIISSRADILNAWICDWPELPKTTTLFNFTASHDGVGLRALEGLMNEQRIKDLLINCEKRGGLVSHRRLSNGEDKPYELNISWWSAMEDPGRDSNRYQYERFLLTQLLVMSLKGVPAFYLPALLASENDIKSFSMTGQRRDLNREKFKSEKLAAVFNNPESNANKNLKYLRHAMDVRAKLPQFHPQSHMECLSKNRADIVALKRGIGSKAVFTIHNMTENKINYRFIDYEFNKLIKNDLNMQDYLTSNKYNSNNIELDPFQVIWLGF,<
|
| 32 |
+
<ps1>,MTSSKPKKSSRVRKTTKNSKKNHNTMMPLLPKTPPSFKNKVVDKKALKNLVSWAYKTHGTAVTAAMADNLKDLGFKYATQAAVSISVNDLKVPEAKQDLIGQAEAQITATEECYRLGEITEVERHTKVIDTWTETNERLVDAVKNNFNQNDPLNSVWMMANSGARGNMSQVRQLVGMRGLMANPQGEIIDLPIRTNFREGLTVTEYVISSYGARKGLVDTALRTADSGYLTRRLVDVAQDVIVREEDCGTERSIVINSEDGKFGSRLIGRLSAEDILDSEGNLIVPKNTAIDPSLSKTLETSLISKVNIRSPLTCEANRSVCRKCYGWALAHNHLVDLGEAVGIIAAQSIGEPGTQLTMRTFHTGGVSTAESGVVRSKIKGKVEFGSKAKIRGYRTPHGVEAKQAEVDFLLKIIPTGSITNKAQKIEVTSGSLLFVEDGQDIDSDITVAQITSGAVKKSVEKATKDVICDLAGEVRYDKVIQPKEVTDRQGNITLKAQRLGRLWVLAGDVYNLPPNAKPVVSTETKVEQGTVLAEASQSSEFGGEVRLRESVGDSREVQIVTTSMLLSNFKLIEESTHSGELFHLESNDGTIYRLNTSPGSKISSGEVIADLADERFRTKTGGLVKYAPGLSVKKARSSKNGFEVSQGGTLLWIPQETHEINKDISLLMTEDMEWIEAGTEVVKDIFSQTSGIVTVTQKNDILREITVRNGSFHECEDEEILSRFTEEGKLVNPGEKIIDGVDNDEILFVQKLETSKGKGLLLRTVEEYTIPNEAELPELSHVKQEKGPSLALKAIQRLSYKDGELIKSVEGVELLKTNLSIESFDATPQMTIDVETIQDKSDKSINRLNLVILESILVRRDTISDSSHGSTHTELQINNNQLVKAGDVIATTQILCKERGVLQLPDSVEGEPIRRLIVERNEDKIKINIKDKAVVKTGDRVVDGDLISKGVKSTSCGEIEEVSSEYVILRIGRPYMVSPDSVLHVKDGDLVLRGDGLALLVFERQKTGDIVQGLPRIEELLEARRPRDSSILCKKSGVVQIKEGTDEESVSLSVIERDDSISEYQLLMGQNIMVSDGQQVTGGELLTDGPINPHDLLDCLFTDLKDQKPLMEAAQESISKLQRKMVNEVQNVYKSQGVAISDKHIEVIVRQMTSKVRIEDAGDTTLLPGELIELRQVEDTNQAMSITGGAPAEFTPVLLGITKASLNTDSFISAASFQETTRVLTEAAIEGKSDWLRGLKENVIIGRLIPAGTGFSGFVEELASEAGPHPDILAEESGGYRRTQNLRPDYTVDMPQTPIVSSTAILDDPSDEDLETTRNRHGIDPTSSNFAAFARPNAENQFSEDQLPDPAALEGLQEEGLLSDG,<ps2>,MQNPAEKKSSILKDFKNLFIWIIIALIIRWQVIEPRWIPSGSMLPTLQIQDKILVEKLTPKITSKSNLSKLKNKIIVFNVPEQLIDAGYESDIALIKRVIGVPGDKVEVKEGNLYLNDIAQNNYISDKNINYSTGPYYVPEKSLWVMGDNRNNSMDSHIWGFLPYEKVIGKAIFRYWPLNNIGPIRFPSLNNLG,<
|
| 33 |
+
<ps1>,MEDWQEWEYFDYHGELRSKRTKICITCTHFRYSTTDQCVTILTCPFHQKLIPQGDHLVKGCTYWRKDSRIFAPEAA,<ps2>,VRFHIQQEIDIPASTQLYNQICFAIAARYYPPGHRLPSTRQLAMQTGLHRNTISKVYRQLETDGVVEAIAGSGIYVRDNLKKSFNSKNNLNTTPALETKKAVDKLIKLGCTLQETRNLLTNEIDWRIKCGSRIIVSTPREDIGASMLIAEDLSPNINVPVEVIPMEELEKVLCNSNNGTIVTSRYFLQPLEKLAKQYRVRAIAVDLSDFQKELKIIKELKPGSCVGIVSISPGLLRAAEIIIHSMRGSDIVIMTTISDNSNRLLALLKASNHIVCDGPSLSVIENTLLKNRSQLMRVPQIICAKNYLSIKTINHLKTEIGVIN,<
|
| 34 |
+
<ps1>,MKVIVIDDDPTGSQTVNNCLLLLKWDYSTLIKGFQSKSNLFFILANTRSLSENDAKLRLVEICNALKKVISKESYKEEFIFVSRGDSTLRGHNFLEPKIMNDCLGPFDATFHIPAFIEGKRKTIDGEHFVDNVPVSQTIFAKDKIFGYKTSNVKQLLFQKCKSQIKFNDIQNLKISELKVLESKEKNIVFNKIRNLKENSHVIVDIENYSQLQKFSLSIKKLSKQKKFLFRTAASFISSISAVKDNPKEPFFYSLIRRKNREKKFLPGFLVIGSYIELTTMQLKEFLEISDCIPIELDVFEFLKISKLKSNQDQLEVFKNKLLAQIRSILKQENTPVLFTSRKEVSLARNDEQVNFNNSLAHFISELVSDLKNEIGYLVSKGGITSNVILSNGFKANYVYLQGQIITGVSLVTFKLENDENLPIVTFPGNIGNQDSLVKVWRILENKNNSSN,<ps2>,MRILHTMLRVGDLDKSIDFYVNILGMNLIRRKDYPHGEFTLAFVGYGSEKDNAVIELTHNWSKKSEDYELGNKYGHIAIGVKDIYDICQGLEDNGCNVTTKPKTMKNSTTVLAFVEDPDGYKIELIERD,<
|
| 35 |
+
<ps1>,MANNFYQWWKNHRRVVTFGGFLILLGLYVSPVIKEAKYKNMCIKLSEKGALNKLNGDNIGETLLKDTGLSIEELAKIEGYRNCF,<ps2>,MNDLNIEFPLDDFEELISQIGWSSLDEWFIFWNLKKEMLSINNFWDDNVKDDWIWGLALPLLSQAYKLNKKSPDRKIIGISALPGTGKTTLGKWLESISLKLKFKLSVISIDDFYLPSEEMEFAIKNNPWNVSRGFPGSHSIDLMKEKLLKWKTDGQLNVPVFDKSLRKGLGDRAHWREESPDLLIIEGWFLGVKPLSIDLDNSEKFSPPLSVFESSYRNKIQNNLDQYLDIWNMIDQIWHLKPLKFEYLNEWKSNQEKSMHFKSGSSLKGDNLSNFLRMLNVSIPHKSFDDINSDVLLMINQERKLVRVGLNQQISK,<
|
| 36 |
+
<ps1>,MKLNQFLKWHNIVSSGGEAKILINSGQIKVNGEIEKKRGRKLVKGDKVMFLKSELIFE,<ps2>,MKPQLTLQTPLELPHQEISNYLNQLWISEDEESVGANTFTLMVWQPAWLEQCLVKSGLISGPITGTLSPEIIKVAKKLIIDKGLSHTTSIHSEELLTLLKENLLNKDYEDLRGQFFESSISTLNPRRLITLAPTLNKESEIKTFVSAYCPLSDNTITQPICGDLVVIRGDSNSINNKGLKIIDDLSIKDLPIWLWWNGSLDESQEIFNFFTDQGIRLIIDSANGSPKRCLKILYQSIKSNKAINDLNWVRLKSWRESLAMIFDPPSRRPILEHISDIDIDIAEGNFLQALLLISWISDKLEWVFSKINKHGDLIKIEFKRKNGENILTCINPVPLGNPSIHSGQVIGLRLISKISEVRKNNTCVILGCESVECMRLEAGGMADMQLIEQVVPNSFSSSESDVSKLLGSSRGNTSPLFENAIKVAVQIFNGFNK,<
|
| 37 |
+
<ps1>,MKDKQEKIRMFLPFSWVICAVISFAYINSHLINT,<ps2>,MEFNIQDKVKLKNPLSYLKTSDNMPMLRPPDLVAIDEVGEIIAIKSPDTVEIKFRRGSFLIDTDKIEKTQI,<
|
| 38 |
+
<ps1>,MKYLILGSGSFAGQLIFSEYLERNYDVYGFNRSRVKDHYQWPWIKKYKNDLGNRWFEYNLTNDVEEMISHINRLKPNFIIDFMGQGMVAPSWLKPEVWYTTNIAIKSRLMNALIDSSFLQKYIRIGTPEVFGSNENFLKEDECFNPSTPYAVSHAAIDFNLRCLYKQYNFPYLIGRFANFYGVGQQLYRIIPRLFLSCRSERNFILDGKGESRRSFIFSKDIVSAIDSMIKFDGIGQEFNFSSNEEISIMSLVNKICNLTNVDKSRILKFGPERPGKDRYYRLDIKKSKNVLNWEPEVSLDEGLNIINIWISENIENLSNKSWTYEYKD,<ps2>,VTQRHFFVTTSSSSAAEKTLKTKIWKRVFIVCMILLISGSFFYFNHEENNTYILKTLELNGSVKEGDTLFKMNCVGCHGITARGLVGPDLQSITMRLNDAEIIKQVIEGVTPPMPSFEIDPQNMSNLLTYLHSL,<
|
| 39 |
+
<ps1>,VENDEIKYKKDIPIEWLKMPSSAKEAEKLNIKKYFIGDKRFMRELEDRDEYNAA,<ps2>,VILKTIKISNKLCLIGIIVFCLFQNHSVSASREPLIRVLISKNRNLRIRSDKSIPLIIKGQKFSNKKIKGLTVKKENNTTSLFFDKNKQKIYDLKNKVKLVVKSSDGRGIWVGQKRYSGILNLLVLESEILVINILGIEKYLSSVVGSEMPAKWPLEALKAQAIASRTYALKQKGNQIYDIDSTQKNQVYNGLESRTYKTIRAVRSTRSLVLTYKNKLINALFHSSSGGMTENSQDVWKNEYPYLSSVRDFDRNNPKLQWKKKFSSGELQKLFPEIGGIKKIEILNITNTGRVKNVQIFGKYGSDQISGVDIRKRMNLKSTFMRFKFIEDKKYISDNDNSNNPIEKTLIVFGRGSGHGVGMSQWGARYMASKGQKADRILKHFYKGVGIKPFSKNYL,<
|
| 40 |
+
<ps1>,MSIETTVLDFKLSNTFEEYQAHMNAPEQQAMFKEMGVKTFYIGKSLEDPKRATVMFQGPVNTCYDIFVNPETKPIVEASGHIYEGTIINRWIS,<ps2>,MMFNKQKKLILNLKILKLLFFSPLLISIPFYLGNSDAKAGLEFQWDQDSGYRRLKWFQKENKKRFRNTIFFFLRPSDRQANLLKITLNIPKTFDSTLKDKVSFCKVKIGGFEGRTKCIEDIPADVDINEDNSSLDIYPYSPIPSNKDSYAIVFKKISNPKKSGLKQFHSYGQYAQKNTSSRYLGSWTIVID,<
|
| 41 |
+
<ps1>,MNFITNSRQFHKSLAPWVFLPLFISALTGTFYRICKDLLGYSRDEVHWLMSLHEGEWLGDNGELIYVILNSLGLIWMLITGFQMFSKKISFPKKVTKGESKG,<ps2>,MASFTVGIVVFPGSNCDRDVSWALEGCLDIKTKFLWHESSDLNDVDSIVLPGGFSYGDYLRCGAIARFSPLINSLHDFIKSGRRVLGICNGFQILTESGFLPGALVANKNLNFICDDVDLNVITSKGGWFQKLNENQNIKLPIAHGEGCYHCDQDTLKRLVDNDLIALKYKTNPNGSTSDIAGITNEKGNVLGLMPHPERACDESIGGIDGLYTLRSLITQ,<
|
| 42 |
+
<ps1>,MNNSQRSVTHSQNGDYRTIEQTMEKLSGGTRRLAAQLTTSATFNSLWNVLTDYDRLNLYIPNLLSSRKIYKNNNNVHLKQVGAQDFLGMKFSAEVTIDLFEEKELGLLKFSLIKGDFRRFEGSWKIKKIKDTSKNSLIYDLTVQGCQWMPIGMIEKRLKKDLSENLIAVDKQAKASIK,<ps2>,VILNPELQEKGEIKDLMKSRGSFRAFPLAAITGHSLLKLSLLLAAVDPSLGGVIIAGGRGTGKSILARGLHTLLPPIEVLDNESILEKLTMSNSNTSLRPIGRNLDPDKAEEWDISTNKLLEEVIGSDYLNQIEEIPKKVREAPFIQVPIGITEDRLVGSIDVAASLSSGEQVFQPGVLAEAHRGVLYVDDINLLDDGIVNLILEATGREQNNIERDGLSLSHPCRSLLIATYNPEEGALRDHVLDRFAIVLSADQSIDNNQRVEITKSVLSHAENNIKFSEKWSEESDNLSTQLILARQWLKDVKITKEQITYLVNEALRGGVEGHRSELFAVKVAKANAALRGDENVNSDDLKVAVRLVILPRATQIPPQDDDIQPPPPQDQSPPPPQSNNEDSEPESNEKEDNQEEEQDNSDGEEDSTPDIPEEFILDPESCMVDPDLLLFSSAKSKAGNSGSRSVILSQSRGRYVRPLIPRGKVKRIAVDATLRAAAPYQKSRRLKNPNKTIIIEENDFRAKLLQKQAGALVIFLVDASGSMALNRMQSAKGAVIRLLTEAYENRDEVALIPFRGNQAEVLLPPTRSITAAKRRLETMPCGGGSPLAHGLTQSAKVAKNALSTGDIGQVIVVGITDGRGNVPLGTSLGQAEVNENENVDLKQEVLDIAAKYPMLGIKLLIIDTERKFIASGFGKELAEAAQGKYVQLPKATDKTIAAMALNAINEF,<
|
| 43 |
+
<ps1>,MSKDFKSGKVKRLPINNLNLPNFVNNSLRNNTKVNTVEGTNVIRVPFGKRFPKKQRPDKNQNIATLILPINTFINPTPPPHVA,<ps2>,MPSLSDGDFYYYRNFYIDQPPQMAQVFYESLHVISFSLKIIFYFLERLSDVN,<
|
| 44 |
+
<ps1>,MRHQLRVPLLSKPADQRKALLRALTTQLIREGRITTTKARAKALRNEAERMISLAKEGTLSARRRALGYIYDKKLVHSLFEKAQERYGERNGGYTRIVRTVARKGDNAQMAIIELV,<ps2>,MKDIFLVLDSYQYQMESNYQETSSLTNLFTENKFIGWLGLFIVFFSIFAIIIFQFLEWESNDKNKE,<
|
| 45 |
+
<ps1>,MNERNKSLWKQAIKWPLYSVAILPVFISGAYTLNSFKNVKIYNLIAFTIAAILILIWENLTNDLFDSETGIDEFKFHSIVNLVRSKTIVSITAYTSLLIGLVVIAIISISTSINVMLLVGACCFLGYLYQGPPFRLGYQGLGEPLCWLAFGPFAYAAALIALNPSDIYMISIPWKESLLLGSGSSLATTLVLFCSHFHQIKEDKEHGKNSPLVLLGAKKGAKIIPWIVFIIYVFQLFLIINGFIPILCVLFLISFPQSLKLINLLKYSYNKPEAIKNCKFIAIKFQTLNGIGLIAGFIINYLIYK,<ps2>,MKIMDNFDDDLSLKQKEFVEPIDKATNKDLFEKKDEFKEATPKVLHLNSLITKNIYLFTKDPNYKLFAWLMVQLFIFSLFVLVATLMKNNLVPYINSL,<
|
| 46 |
+
<ps1>,LLLMKRLLLAAVLFLLSEISFAKEKLNYTITSDSQVQNGKGNFEAISNVVIKSINNNF,<ps2>,MDEDSRKVTEEVWLICPNSTEVRRFTKNKNNKDKFFEYMFVDSGIIIGVLGAKPPLMKTRKEIKIEAARKEYQQLIISGWQVTIPKW,<
|
| 47 |
+
<ps1>,LDQFEVKVFIRLRPSVLDPAGEAIKSASSKLGVAGIKSLRIGKLIEVKIESNEEDIKEKIELLCDRLFANTVIEDYEYSINKL,<ps2>,MNLKQITQKDQLDLKKIYFDSIISIDQKIYTSEQKRAWASQAWDNKYFNLTLKEGKGWLINEREKIIAFASRYPNNRISLLYCRGDSQRKGYGTKLLKKIEKEAIKEGLPCLTTEASLISYKLFLKNSWKIIRKEKIIIKNITFERYKMIKNF,<
|
| 48 |
+
<ps1>,MEQGLNNPGPLTIFLVFTAGLLTSLGPCSLSLLPITIAYVGGTKNNKFKLISFSGGVIFSLITLGALSGFLGKIYGQLPSYYASLVALIAIIMGLNLLGILKFQLPNGPDLQFMEDKVPSIITPFVVGGAFGLASSPCITPVLATLLAWVSQAKNPTISIIFLFFFGLGQVTPLILAGATTENLKQFLELRKYSQVIPTLSGVFLVSLGILNLISNWI,<ps2>,MKENITELWFSWFYKNWEKNAPGNLIDKGLSPSQIAERFVNENHKEFLEIANEFDEDNYQALNEFMKLSESELHILKYFLKLIKLKNS,<
|
| 49 |
+
<ps1>,MGTANLHDSTNKPLYGERIIEESNIICFENPNKKRIYEISIELPEFTCKCPFSGYPDFAKLNIYYQPNMKVYELKSLKLYINKFRDLKISHEEVVNRIMDDLLKAAVPHWIHLNADFNPRGNVSMKLDIYSGQKRN,<ps2>,VRIIFWGTPEYSVKSLEVLKKSDHDIVAVITQPDKKRSRGNKLISSPVKEYATKENIPVFTPETIKENIQFISILNDLSCDLFIVIAYGKILPKAILDIPKYKSWNAHASLLPRWRGAAPIQWSILEGDKITGVGIMRMEEGLDTGDVLVEKQIKIENNDNLKTLTKKLSDLSSELFLRAISDIEQNKNRDINLLLKKQTDFKRELKYARMINKLDYIINWENSATDIYRKINALYPRANTTYKRKNLKIIKIKILTTHEIHNKNYKILSNVFKPGLIIGLIKNVGIIITTKTDPILLLEAKLEGKKVSSQNQLIQQLNPVIGENFSD,<
|
| 50 |
+
<ps1>,MAQLETRTEPMVVNFGPHHPSMHGVLRLVVTLDGENVIDCEPVIGYLHRGMEKIAENRTNVMYVPYVSRMDYAAGMFYEAIVVNAPERLANIVVPKRASYIRVLMLELNRIANHLLWLGPFLADVGAQTPFFYIFREREMIYDLWEAATGQRLINNNFFRIGGVACDLPYGWLEKCIDFCDWFAPKIDEYEKLITNNPIFKKRIEGLGTIERDQAINWSLSGPMLRASGVSWDLRKVDSYECYDDFEWEIASEKEGDCYARYRVRVQEMRQSLKIIRQACEMIPGGPTENLEAKRMATEDKKSEIFGMDYQYVAKKVAPTFKIPNGELYTRLESGKGEIGVFIQGNNEVTPWRFKIRAADLNNLQILPHILKGAKIADIMAILGSIDVIMGSVDR,<ps2>,MVRKISFIGVGPGDPDLLTIKALKKIESADVIFWADSLIPEKIINFSLKGSEKIKTSTLTLEKITSIMIERFNEGKTVIRLHDGDPCLYGAVKEQLEILRQENIETEVIPGVSAFQVAAAYHQAELTIPDITQTIILTRAGGRTGMPEKESLKDLAKHKSSLCLYLSARHIKSSQKTLLEFYPPETKVIVGYRVSWDDGWTSLIELKDMEKFTLEKELIRTTIYIVSPAINTIANRSNLYNPSYKHLFRGK,<
|
| 51 |
+
<ps1>,MLRPPFSQESISIDKWDVIVIGAGAAGLMTCLELPENLNVLLLNRNTSKRSSSRWAQGGIASVVRPEDSFALHVEDTLKAGDDLCDLSAVEMLVKDAPGCVDRLQNLGMIFDQSSDQLSTTLEAAHSCRRVLHVKDRTGRALVEVLEDHIENKENILHCRGVRVTELLIEKEVCKGVQVLDGSNLYWITSKAVVLATGGGGHLFTNTTNPAQSAGEGIALSWKAGVAIEDLEFIQFHPTALKFYGSPCFLISEALRGEGAVLVDKNGESPVKHLENGDLATRDQVSRAIMNNMQENDVDHVGLDLRFIDPEKIVERFPMIISRCQDYGVNPLNEVIPVAPAAHYWMGGVHTDLNASSTMKGLYAVGEVASTGVHGANRLASNSLMECLVFARKMSCIELNAPYNLRRLDRYTTEIFMDNPKEDFILGVSDKIDSLRKLCWSNLGVSRNKKNMNKLLKTLQDEIDQLQKNPLLECLNKIEIDQKLKLSEPNRRGLNLLLDLHNRQITTLLLLKACLFREESRGGHYRDDFPIKETTWKCHTRQQLNQEIIKRFIKN,<ps2>,MRTILISGANSGIGLNIAHKELKAGNRISIGLRDLESVKGSVIDPNNWTNEKILLNKYDALDKFSAKKWVENTVSKFGGFDTLINCSGVLSKVPFLYKDGDEEEILNTFNINFLAIWHLCRISWKHLSQSNNGRIIVLVSMSGKRSKGDLAAYSSSKFALMSLCQTMKNKGWEENIRVTAICPSWVNTKMAEKISSIEKSKMTQPGDIAEICSTILKLPMQSVPFEIALNCNYEI,<
|
| 52 |
+
<ps1>,MSRKSELLKGEETKNFSEFSQLADFSLMNSLNADPHSTKDGNDHRARSVNSGHYVPVTPTPIPEPIYVSHSKTLFKELGLSSDLTKDKNFCRFFSGDIEVAEYPMRPFGWATGYALSIYGTEYTQQCPFGTGNGYGDGRAISVFEGLFNGKRMEMQLKGGGPTPYCRGADGRAVLRSSVREFLAQELMHALGIPTSRSLTLYVSGTEIVRRPWYTEGSRYFEPDIMVDNHAAITTRVAPSFLRVGQLELFARRVRSNSHDDAFNELKIIVQHLIDRNYRDEIDPSYSFNEKVIRLANLYRGRLISLVTNWMRVGYCQGNFNSDNCAAGGFTLDYGPFGFCELFDPRFQPWTGGGEHFSFFNQPFAAEINFKMFCSSLLPLLLENKEDIEKLEKIKNDFSKFMSKEMQLMWAKKLGLEKYDETLTNELFNLMVNSKVDFSIFFRKLSHIPDNISFLKDSFYLPSSEELDKEWFIWLKKWQDCINKQGDLKEISKSMKQVNPKFTWREWMIVPAYQEAEEGNYNKIKELQTIFKNPYEEESLEIEQKYNRLRPREFFNKGGVSHYSCSS,<ps2>,MNYIQIKDLSKSYSDIKALKNLSMEINAGTLFGILGPNGAGKSTLIKILATLVEPDGGEVFVNNINLIKNPRKIRELIGYVAQDIALDKILTGRELLDFQSDLYHMNKKEKYERIKLLINQLEMNDWIDRKCGTYSGGMKRRIDLAAGLLHLPKVLILDEPTVGLDIESRNIIWQLLKDLKNDGMTIILSSHYLDEIDKLADSLVIIDDGKVIAQGTPAQLKNKLGGDRITLKVREFSNHEESKKISEILSSINGISQIIINKAQGYAINFVVDKEKDLLTKLKVELAFSKFEIFSLAQSQPSLDDVYLQATGKTLLDAEISMTGKRDLKKESKQSMR,<
|
| 53 |
+
<ps1>,MFLISAEKFSLWKKKQLSKGGDNHSLNLLLESLGGLSNIELNLLKINLEKNLNFKVNLDLIESFWDKHLNTSIPIQYLSGISFWRNLKLEVSNRVLIPRPETELIIDIISGIFKNKEEKITFVDLGTGSGAISIALALENPNWNGIATDIDKNAIKIASRNFATYSNQSNLKFYNGNWWDPLKNFKGEIDFAVSNPPYIPQDTYEVLPIEVKNFEPKLALLGGQEGLDHINQIVQNAPLYLKNKGWLLIENHFDQGEKVKKLFLENRFTSVKVLKDFSGIGRFTIGRYK,<ps2>,LFDKENLKYFLIWPMSVLLAIFFKYYGFLKPDFLLINNYLVLLLVCGPALVVTIILVFNKI,<
|
| 54 |
+
<ps1>,MKLQTQFTVPKKEFRDLDYVNKVKVLEETLKKECMDYPTKEDCLVCCN,<ps2>,MSKLKGPDGRIPDRLPDGRPAVAWERRWTEGTLPLWLVATAGGIAVIFVLGIFFYGSYQGVGAG,<
|
| 55 |
+
<ps1>,MNNKRIFHDPIHKEIIIDSDKPEELMIMQLIDTLAFQRLRRIKQLGAASLLFHGAESSRFTHSIGVFCVARKIYRKLVEINPDFSQNKFILFGAALLHDLGHGPLSHTSEVIFAHDHELWSKNLVKNYSPISSILKNFGTELPNQIGDLFKTKNLFSRPLKTLISSEIDCDRLDYLLRDSYNTGTKYGLVDLERIISALTFSPDGNIAIKPKGVIAIEHFLVLRNMMYRTIYNHRINEISTWILEKIIQIIKKDSVKKDLWIDESMRRWIFFPNQLEVKDFLANDDIVFYFHLMKWKEESFEPLKTLCKMFIDRKLLKASDISFLTKLKRLEILAFARKKCKLNNYDSEIFCGIKERSFKGFKSDNSLKIWDGTYQNLLENQSDLINTLMSSKDTSLIIYPGEFRKEIEDQIAIERANV,<ps2>,MSLKSVLKNKSLGILVHPSSLPGGSYCGTFGEGAKDWIKKLCKYKINHWQFLPLTPTDSTGSPYSSPSSFALNPWFLDINKLIEENFIISLNKKDLQSINQNEDHFDFDYANNLSKKLGEYLLFDWESQSEMRKTDFYLWNKKNTWVEDYSIFMVLREKFNMLPWWEWPLEFKQKENEFIKTWIKDKKNEILKTKLIQWHLDKQWKEIKVFAKTNGITLIGDLPFYVSRDSVDVWSNKSLFSISQNGDLLFQSGVPPDYFSSTGQLWGTPTYYWAKHKSTAFRWWRKRFKRQFELVDILRLDHFRALAGYWRVDGNAQNAINGSWINSPGKELLNLLKKDLKSDYLPIIAEDLGVITKDVEILRDNYELPGMKILQFAFDGNDNNPYLPKNIEKENWVVYTGTHDNATSTSWWDCLDITIKTHIKDKYKYSIDPSWNLMEIGMSTKANLFISPIQDILSLDDSSRLNTPGTITNNWRWKLNQTLDEIDMNLKKYSDLGNNYGRLSN,<
|
| 56 |
+
<ps1>,MSIETKKYNNLISISTELRKRIIKTSYEAKIPHIGSCLSCIELLVFLYWKELNIDPSNSEAINRDRFILSKGHGAPALFQVLGLKGFFPIERLNSFGKPGSVFHEHPPKPGYIPGIEAATGSLGHGFPMAVGMSLAKRINNLQYRTYSILSDGECNEGSIWEAAMFAGAQKLDDLTIFIDFNKWQATGRSKEVLALDPLKEKWQSFGWDVYEIDGHKFNQIDKSIELAKTNKNKPSAIIAHTIKGKGVSFMEDNNNWHYKTPNEEEFKKAFEELKN,<ps2>,MSKFSSQEIESQYNLIKTLLSDPEKYNDALDAIKKDIAHMPLELKKKLEEENITF,<
|
| 57 |
+
<ps1>,MQKKSFSISWGDTSLEMLPSKALLLPQTNELLICDVHLGKAEYFQQNGIPLTNNSDEQNLLSIKKIVENHKPYKLIILGDLFHSKYSISKSIKSKVENLSESLNIKIELIVGNHDIGCKVKNISFLEYKRSSNFIFSHEPIGKFENKILNICGHYHPKTFLKNSKDKLSFKCFAMDEKNNTLYLPAFGDLTGGYPCKNSFKKWAIISEKEIIAV,<ps2>,MKRLDLIFSERELDAIINTLEKANVPGYTVMKHATGRGPERVVTEDMEFTGLGSNAHVIVFCEQELIDQMRDNIKSDLSYYGGVAYISEATPL,<
|
| 58 |
+
<ps1>,LEELITKKLEVNDNLKSRFHNGFNIVKSTFLSSPIALRLWSSFFVILPIFVQAPWVRFAPISALCATFFILAAAFLLSRKEGDKWFIVGSLLLGVTGSWLGGCLFWGWLSAYPILHIPVEAVALPLAIVGLGTKWKIGSSFYISSLFGTAITDLTIFLTGIMDQWKEVIIADSDNAPLILQKTSENLIQFKSLSIIILAALILWFISKEIFNYATSNSINGKAFLVSSYVIQTTLIVDGIFIMLAIIQPTLSGLV,<ps2>,LNRSFYFKFSVVIISFLLVWTLRDFILLIICSLVISNVVSNLCYQIQTILKLPRFVSLLIVLVGISFMIFAISIIVLPPFIREFNEILIDIPNGLSRVNELVNSNLNKFNDLIYGKESERIVNIFDLVNDVVPIPDGATIAKAIQESFINIINLAGNLGSGFIRVIFVLVVSFMISIEPKAYKEGVLFMIPKVYRNKFRIILDKCNIALTNWTFSIVISSISVGLLSLIVLSILDVKYVVSNAIIAMILNIIPNIGPVLSGIFPISIALLDNFWKPVAVFGAYIVIQNIESYIIMPSILKKKTNLLPGLTLISQFGFTFIFGPLGLVLSLPIVVVTQVLIKELINDN,<
|
| 59 |
+
<ps1>,MSQIFTWIWVSSGILLILLVLLHSPKGDGMGGIAASGSSMFTSASSAEASLNKITWTILIIFLSLAIILSAGWI,<ps2>,MSKLKGPDGRIPDRLPDGRPAVAWERRWTEGTLPLWLVATAGGIAVIFVLGIFFYGSYQGVGAG,<
|
| 60 |
+
<ps1>,MGEAKRREELGLPPREKKEAKKDSKSNLNQILNKYPFAPYILGISLLTILIIDLVNYYK,<ps2>,MASNKDNQLVEKNDDNLGVENISNNPSIQSEQKLEVTEDEISFKEEDLDNGFACFGFNKLILNSLESKGYKTPTPIQKAAIPELMLGRDLLGQAQTGTGKTAAFALPLIEKLENNKESNAKVLVMTPTRELATQVADSFKSYSAESTNLRTLAIYGGTDFRNQISSLKRKTDIVVGTPGRIMDHIRQGTFKINNISCLVLDEADEMLKMGFLEDIEWIIDKLPENKQMVLFSATMPNEIRNIAKKYLNEPAEILIKSVKQETQLITQKYINVQRHHKLDALKRILEITNEGVIIFVRTKLLTTSIAEALENSGHSVAVLNGDIPQNQRENTVDRLKKGFIDILVATDVAARGLDVERIKLVINYDFPFDKETYTHRIGRTGRAGRSGEAILFVNQREKHFLRNLENSTRNKIEEIEIPNNKIINEKRMGKLITNLNESSLDQENNEEKKALMIDILDTLREKHSMEDSNIAMAAINLAIGNKSFFINEDESWLYRQNNSDRNRSNRNGNNRMRNTNRRNNYQNDSFETYKFNFGKMDRVRVANIISSICTSTNINGRLIGKIQIFNEYSLVDLPRDLHGEVKNKLKNLRIRN,<
|
| 61 |
+
<ps1>,MNSKLKFIYEGKAKKIFAYEDSDKVIIEFKDDATAFNALKKAKFEGKGELNCLISSKIFEFLIKNNIPTHYIGLKNNNSMIAQKIKIIPLEVVLRNTAYGSLCKQTTIKPGTVLESPLIDFYLKNDTLNDPLLTKDRINLLKIVDEEELDFISNMTLKINKLLKKFFYNIKLDLVDFKLEFGYNSNGQIVLGDEISPDNCRLWDLNQKNGMIVSLDKDRFRNDLGGFIEAYSEINKRINNFI,<ps2>,MGFVPLHNHSDYSLLDGASQVSKIVDRACELGMDSIALTDHGVMYGVLDLVKKCKSKGIKPIIGNEMYIINGSIDDPQPKKEKRYHLVVLAKNHTGYKNLVKLTTISHLNGMRGRGIFSRPCIDKSLLEKYNDGLIISTACLGGEIPQAILKGRIDVAENTAVWYKRIFGDDFYLEIQDHGSIEDRIVNVELIRIGKEHQIKVIATNDAHYISNMDVEAHDALLCVLTGKLISDEKRLRYTGTEYIKSEDEMLRLFNDHIDKESIKEAINNTVEVSQKIEEFELFGTYRMPKFPLKEETDSLSFLTKITKQGLLSRLNKNNLDEIDEIYKKRLTSELKIIDDMGFPDYFLVVWDYIKFARDSSIPVGPGRGSAAGSLVAYALQITNIDPVKHGLLFERFLNPARKSMPDIDTDFCIDRRNEVIDYVTNRYGEDKVAQIITFNKMTSKAVLKDVARVLDIPYGESDKLAKLIPVVRGKPYKLNEMIDKKSPSPEFRDKYLKDIKVKKWIDLALRIEGTNKTYGVHAAGVVIASDPLDMLVPLQRNNEGQIITQYSMDDIESLGLLKMDFLGLKNLTMIDKTISLIESSTGQKINIDKLPPKDNKTFDLIGRGDLEGVFQLESSGMKQVVKDFKPNSLEDISSILALYRPGPLDAGLIPKFINRKNGSEKIDFPHPFIESILTETYGIMVYQEQIMKIAQDLAGYSLGDADLLRRAMGKKKVSEMVKHRNIFIEGSCKKGVDKKIANDLFDQMVLFAEYCFNKSHSTAYGAVTYQTAFLKAHYPVAYMASLLSVNAGSSDKMQRYISNCYSMGIEVISPSINLSGIDFTIKKDQILFGLSAIKNLGDSAIRNIIDNRNKLGVFKSFSDLCDRLPSNILNKRNLESLIHCGALDEFSENNNRAQLFSDLEYVMEWASSRNRDRISGQGNLFDSISKNDTKEFSLSQGSKVEDYSLIEKLKLEKQLLGFYLSDHPLKHLAKPAKLVSPISISQLENSHDRTKVSLVGMIPELKQITTRKGDRMAIVQLEDLSGSCEAIVFPKTYCRLSEFLLTDTRLLVWGTIDKKSDKTQLIIDDCREIDNLKLLVINLDSSQASDIRIQNTIRDCLVKFKPDRDKCGIKIPVLAAVRNNDSITYVKFGDQFCVGDILGVSKLLSDKSFQVNLKSMIA,<
|
| 62 |
+
<ps1>,MNIKQPSSHKNPEPESSVLYIVGTPIGNLSDLSSRAINILKNVSLIACEDTRQTKKIMNKFEFTNNLISFNKHNSLKKIPRIINDLNSGKSVALVSDAGMPSICDPGEDLVKNVRSNGSNIICIPGPCAALTALVSSGLPSSKFIFEGFLPKKKSQREKILFEISKNEKTTIIYESPHRLKKLLNELKIYCGGEREIQVSRELTKKFEEHIGNDINNVIKTFQEKEVIGELTIVIKGIKKESNLLINKSDLKKELNELIKAGLSLSAASKYLAKKHGIKKSETYNLN,<ps2>,MSFLNNWWLIPLIITIFSGILCPAMGTVLITHRRLLQVNLISHCVLPGLALALALGIHPSIGGVISGLVGAIIAESLTNKKSENYEAVMNTILAGMLGFGVLLIPLLGIRIDLEAVLFGDLLTANLGDLLRTIIAFLTFILLVTFGYEKVVYVGLDPEGASASGINVSLLNLALSFTTALVIVSSMSAVGVILVIALLSTPTLLGLDKAQSLRIAMMRSSFFGLCISLLGFILSIVFNLSPGPAISVICVASLIIPKIGNKF,<
|
| 63 |
+
<ps1>,MAAKEHKSLQGSKILLIEDDKSIRLTVTESLISEGFEVSNFKDGSSALDFILGEGIKDFDLILLDLMLPGLNGLELCRKIRNEELYTPILILSAKGNESDRVLGLEVGADDYLTKPFGISELIARCRALLRRSKRGKEKKQKIETIIEYKNIKMFTEECRVTNFNQEIILSPKEFKLLELFIKNPKRVWSRDLILEKIWAIDFIGDTKTVDVHVRWLREKLEENPSAPKIIKTVRGFGYRFG,<ps2>,MRTILISGANSGIGLNIAHKELKAGNRISIGLRDLESVKGSVIDPNNWTNEKILLNKYDALDKFSAKKWVENTVSKFGGFDTLINCSGVLSKVPFLYKDGDEEEILNTFNINFLAIWHLCRISWKHLSQSNNGRIIVLVSMSGKRSKGDLAAYSSSKFALMSLCQTMKNKGWEENIRVTAICPSWVNTKMAEKISSIEKSKMTQPGDIAEICSTILKLPMQSVPFEIALNCNYEI,<
|
| 64 |
+
<ps1>,VARIAGIDIPREKRVEIALTYIYGVGLTRSKLILSNTGVNPDIRVKDLSDSDVQKLRGATEDFTVEGDLRRKEGMAMKRLQDIGCVRGRRHRMSLPVRGQRTRTNARTRRGSRKTVAGRKK,<ps2>,MTLSSYRMHRIYLAATMGYGLGSDDPEEVAYYKKLRKEMDEMKKDVVKKGIPLTWDIPDGMDK,<
|
| 65 |
+
<ps1>,MENSKPNYWQNAERTNGRMAMMGFFALVVNYGLFGWIIPGIF,<ps2>,MQILIIPIGFILWYFAYESKPINNDEVTSLWEKENYVKRTKLLNILKESF,<
|
| 66 |
+
<ps1>,MDLCFLSTNITSFVADPLSHEFMRKALLMSSLVAAVCGFLSSYLTLKGWALMGDAVSHSVMPGVVVAYALGLPFSLGAFIFGVGSVALIGFVKQKSRVKEDTVIGLVFTGFFALGIVLVSKIKSNIDLHSILFGSPLGISLSDVKQTVFISLLVVILLSVFRKDLILYCFDPRHAKTVGINVLFLHYLLLTCLSLAAVVGLQSVGIVLVVAMLITPGATAYLLTDKFDNMTIISVISAIISSVFGIYFSFWFDLETGGSIVLVQTFIFLFAFLFAPRYGIFKFKKLFSSY,<ps2>,MKDMPTWIDEYHKGSRFGLNGKVLLKKNSKYQEILIIETDFYGKALMLDGCWMTSVRDEKYYHECLVHPALSSIDKKSHILIIGGGDGGTARECLKYSQVSKIDLVEIDEEVIKVSKTFLKEIGGGAWSDKRLAIHIDDGVKWVETTKDNSYDVIFIDCSDPSEFSNLLFTDSFYKECKRILTKKGILATQSESPESFENIHIHILKSLNKIFKLSETMYSFVPIYPSGIWSWTFASDEELNLSKVNYKEVMEIENNCDVWNLNFQNAAFKMMPNKIVKKLNS,<
|
| 67 |
+
<ps1>,LQISNNNYPWPDDWGRKTSIMGIINLTPDSFSDGGDFCSIEKVLNQVNYFVSNGVDVIDLGAQSTRPGAIEIGAKNESKRLIPYLKKIRSEYPNILISIDTFNSEVAHEALSNGANWINDVTGGRRDEEILDVVSEFNCPFVITHSRGNSITMNNLTNYDDFLVDIIHSLESLTKKALNKNVSKDKIIWDPGIGFSKDTKQNIEILRNVPLLKNFEFPLLIGASRKRFIGEILNQPNPKERDIGTLAISCLCSQQKIHLVRVHNVKINYQVLKVADHIFR,<ps2>,VIPSDTPINQHSLQSLELWLKDLGATKDIDNPSKWYLLLSNWNATIIFEQEDLSVVWESGGKLTKRLFSYCINREDIENAILQGP,<
|
| 68 |
+
<ps1>,MKAKPETTAHVSVKEYCFTKKEVKGVVEASDFKWTFTWSFGKGVLFVTPPLGRALIQDSLLRFFLKKDYELEAGNEYKFIISAKF,<ps2>,MDICLLNIDNNSNKSLNPTSVIGMLWLQTHFEDTQWEALSNNQVIISKENSKLLVKDAISAGLKIKSFSGVSMLDVFQKKN,<
|
| 69 |
+
<ps1>,MKNKVFPFIKKYPMSILLAIIAINLFSIASSLRTEAYLNREKNLCIKYLKHQIDRDTLIKKLRIVKQANPSSICDSVLKS,<ps2>,MNKFEFFKTDAIQSSYGGQFSYKVIGPCCRLYDREELPWPCSRLAWRSKEPSWRRIGARFVADMASRKCPSYSVQILEPGSKPVETVITLFSKKFSSEIQEWWYSKKPGSKEPGNVLPESI,<
|
| 70 |
+
<ps1>,MDYKTSGVDIKAGREFVSEIKQSVESTYSSNVLEGIGGFGGLFKIPLEGLKKPVLVSGTDGVGTKLELAQIKNFHFEVGIDLVAMCMNDIITTGAKPLFFLDYIATGKLEKNQLLEVINGIAHSCRENKCSILGGETAEMPGFYSKNKYDLAGFCVGIADEEKLINGKKICENDLIIALQSNGMHSNGFSLVRKIIENNNQIDKQFEKKYNLDFYDELLKPTKIYFKIVNQILSQNIQIKGMSHITGGGIPENLPRCMSSDFIPYIDKKSWKIPVLFEFLKDVGQIPEKDFWNTFNLGVGFCLIIDKKYKDKILNICNAFDISSWVLGKVLKKNNSKENNFLPEIII,<ps2>,MYFQDIIQNLNKFWSEEGCLIMQPYDTEKGAGTMNPHTFLRAIGPEPWSVAYAEPCRRPTDGRFGDNPNRAQHYFQYQVIIKPSPDEIQEKYLTSLEFLGINPKDHDIRFVEDNWESPTLGAWGVGWEVWLDGMEVTQFTYFQQCGGIDCNPIPIEITYGLERIAMFLQDKESIWDLNWNKDINYSDIWLQFEKNQCSFNFSNSNPENMRKLFAIYQEEANSLIEKDLTYPALDFVLKCSHCFNLLDARGVISVTDRAQYIEKIRKLAREVATSWIKERELMNFPLVKK,<
|
| 71 |
+
<ps1>,MKNFTKNNYSTKRNDTENRRSQSKNNFKKGNDLNTRDDSNRRDNSNRRDNSNRRDNSNKRDNSNRRDDLNRRDDFNRRDNFKRRDDSKRRDNFKSRDDLNRRYDFNRRDNFKRRDDSNRRDDFKRRDDYERKGAIKSNEYSYLKSKEKPRNSFNQSQTRFSSNAQQTENYSENSSKKFQLSPNERNYEDWIWGKHSVFAALNSERPINRIWCTSEIFSSEKFYLLLKDLKSKGVLIEEVPWSRLSQLTSGAVHQGVALQHASTESISLEKLIDISKSKSSNPIIVALDGVTDPHNFGAIIRSAEAFDCKGIIVPQRRSAGLTGTVAKVAAGALEHIPVSRVVNLNRAIDELKKKGFIIIGLSGDGQVPISEFKEKAPVVVIVGAENKGISLLVQKKCDYLLKIPLKGKTSSLNASVAAAISLCYLSNN,<ps2>,MELPCRRFGRTNLKMPVLSLGGMRFQKSWDELKFSEISRKEQNKVENILNLANKFGFNHIETAKYYGTSEIQLGMGFKSIEKKPKIIQTKIPPNRDPKLFEAELLKSFEKLQVKKIDLLAIHGINTPEHLHQAVKDGGCIDILKKFQQENLIGYIGFSTHGELSLIEKAITTNLFDYINLHWYFINQTNSKLIELAHKYDLGVFIISPTDKGGHLHTPSTKILELCSPLHPIVFNDLFCLRNKYVHTISVGIAKEQDFNLHLEAVSLLSESDHYIPKILNRLKEESINSLGIEWYKSWDKNLPNWKNTPGGINIPVLLWLANLIDSFDLEEFAKSRYQLLGNGSHWFPGNNANLLDVNVCESQLLKVLERHIKPKKVIKKLRVLKDKFGDKSLKRLSKN,<
|
| 72 |
+
<ps1>,MSESKSPLDRIYRLIASHAWMTENEAKVLLVMMYASGTKSLGLEGKGLNKFMERSLEKMCSDNKENLQEYLLKIKDKFPNNELLSED,<ps2>,MEPTSSLNRGDRKKGSSLVTGSEVQSQSNGASCFITTDSEKSLVSRQASQVEQIELRTYVFLDSLQPQLAAYMGTVSRGFLPIPGDSCLWMEVSPGMAVHRVTDIALKASNVRLGQMIVERAFGSLALYHKDQSTVLHSGDVVLDAIGSEVRKRTKPSTSWTEVICAITPDHAVLINRQNRSGSMIQSGMSMFILETEPAGYVLKAANEAEKSANITIIDVKAVGAFGRLTLAGKEGDVEEAAAAAIRAIDQISNY,<
|
| 73 |
+
<ps1>,MTMNNLKTKKLVNLGPSGRAVAQPMDVSLLDNFYEHLTMERYANVQYFSIYLWFQERDLDGFASHFLSESQGEMEHAYKFANYFIARGQTVKLKELPAPIQTWDSIEDIISYSFNMEADLTSSLQQLYSISERISDTRTSVFLDPIVDAQTKSEDEFAHILGKVKFAANQPSAILLIDSDLKKK,<ps2>,MFLKDHLKDTYQKASFDNNHLMLENIINIWAHRFGPESLNELFVKDQDQDQLKLIEENQAEASQNQINLELIEDHQSEANQNQTNLELIEEHQSEVNQNQINLELLKNLQYEEKIEFKPKETKKSNNTEIINKDIYGSYKNESEFKDKEELPLPNIKNLRKWINNEKKAS,<
|
| 74 |
+
<ps1>,MIILHIGLFENSFSNIMKSVIFQETANLKKPVPAEKVIELSDKLLEPSSHSKRYPPRLHKTWGTIFFMIAIHLLSLLALQPQFWSMPAVTALFFFYWLTACLGVTLGYHRLLSHRSFVVPKWLERFFATCGAISCQHGPIDWVGLHRHHHSFSDTEVDHHNSKRGFWWSHMGWMFKDVEALKAVPKLSADLIKDPYYRFLNKYFLFLQIPIGLCLYAIGQKLGVGGWALVLWGIPLRLVVVYHITWLVNSATHCWGKAPFESGDGSKNNAWVAALTFGEGWHNNHHAFPNSARQGLFRGQIDLTWEHIKILAKLGFAKKVKLPSRSYY,<ps2>,LNKKLGHKDHFHFIGIGGIGMSAIAMALIKKGYSVSGSDLIQNKETKSLKTLGAIIFDSQIKKNIDFVISKFQDHTLNCVISSAIKDENEELCFCKKNNLSIKHRSEILAMIMNSYTSLSIAGSHGKTSTSTFLSTLLELCTHDSSSITGGIIPIYDSNAHIENTKYLVTEIDESDGTIKNYNSDIGIINNIDFDHCDHYSNIDEVLSSFKKFASNCQKLLINYDCKFTKNNFTSKNQWSIKESNNIAYSLIPNIINKDKTVGKYYEHGKFIDIINIPVPGLHNLSNITAAIAACRMVGVSFKEIKKNTESLKLPKKRFEFRGEINQRIIYDDYAHHPNEIKATIDLARLFIKDKNSSDREEKGRLIAIFQPHRFTRVKQFIHEFVKELSKADVIYVTNIFGAGEKNIDNIDSQLIANLIYKNNKNVTCLKDNYEINEKFFKLTKKNDFIINMGAGDCHNLWSILKNKNTLNN,<
|
| 75 |
+
<ps1>,MDINWASTQIVKNLDRHEKRDLLAWILTQSERTFQRAFEAGQYSSAIGSLKLIWEMTIKDSKEKDSRYHGNYKH,<ps2>,MSKLHLKRFLKKSYEFSLVLFQFFIIILHFIHLEFIPKKEIMQVNFFFSFVGFLLIIISTIVMLISIKDLGRNLSPFPRPTVNGNLTTSGIYSFIRHPMYYSLILISFGFFITKLSFYHLFLTISLALIIKLKIILEEKYLNKKFKNYFIYTDKVKY,<
|
| 76 |
+
<ps1>,VHKNKILVPLSNNSYEVIIKQGLINNIGEELIRIGINSNRKILIVSNKEISTLFGRKLLNNLKKNNFNAEIFNIKAGESHKNFASLSEIFNAAFEVGLDRNSLLIALGGGIVGDVTGFAAATWLRGIEYIQIPTTLLSMVDSSVGGKTAVNHPKGKNLIGAFYQPKAVFIDPETLITLPTREFKAGMAEVIKYGVIKDKSLFEYLENEKNRDKILNLENESLIKIINKSIKTKACIVSEDEKENGIRAILNYGHSFGHVIENLCGYGEYLHGEAISIGMKIAGDIATEKNLWSKEHSLRQDHLIESYGLPIQTPKIKKNDVMKILMGDKKVRNGKMRFILPIELGEVDIFNDINESQFLKYFN,<ps2>,MKKIWKIEKLVLPQHSDHAGVMWHGTYFDWLEEGRINALSKAGLNYVDLTKNGFDLPLIDTSIKYISPLFLGDTVTIETIFEISKSPKIKIHSKFINKSKTILTIAKVNLVLINKKSFSIIRKRPDFISKAFLKLNG,<
|
| 77 |
+
<ps1>,MNDLNIEFPLDDFEELISQIGWSSLDEWFIFWNLKKEMLSINNFWDDNVKDDWIWGLALPLLSQAYKLNKKSPDRKIIGISALPGTGKTTLGKWLESISLKLKFKLSVISIDDFYLPSEEMEFAIKNNPWNVSRGFPGSHSIDLMKEKLLKWKTDGQLNVPVFDKSLRKGLGDRAHWREESPDLLIIEGWFLGVKPLSIDLDNSEKFSPPLSVFESSYRNKIQNNLDQYLDIWNMIDQIWHLKPLKFEYLNEWKSNQEKSMHFKSGSSLKGDNLSNFLRMLNVSIPHKSFDDINSDVLLMINQERKLVRVGLNQQISK,<ps2>,MLTTKITYALSDWIREWRKCRKENPSLDDCIKFTEWKIENYELTDSDRMIIESILLYETEET,<
|
| 78 |
+
<ps1>,VNITFLGTSSGVPTLTRNVSSLALKLSQTAEVWLFDCGEGTQHQLMKSNIKSSQIKKIFITHMHGDHIYGLPGLLATLGLSGNSNGIEIYGPSELKSFVTSALESSFCKLSFPLRFRAVEDFASLNKILFENDKLKVHCACLKHRLPAYGYRVSEKDKPGVFDIKKAEDSNIPPGPIYSELQAGKTVQLKDGRSFNGQDFCGPPRKGESFVYCTDTVFSKSAVNLSKNADLLVHESTFSKEDEKMAYEKLHSTTIMAAKTALLSNVKKLIITHLSPRYTQRSSIKPSDLLKEAQKIFPNTYLAKDFLTAEIK,<ps2>,MKLSKKFEELIIKQLESFGCSMGVTHLVMYLASTEQGTKASFEMIGQWPQIDRLLVSVEDDPSLKVSSPNRRWYPLQENDILLGVLRVETDLKEGNWPVSLDSRLKALSLSLAKCVSIELERQNKNEEINYLKSQVNVIIHQLRNPLAALRTYAKLLIKRLGSDVDSIEIVERMIIEQKQINNYMDSFAQLNSPIQLPLDIGEERLLLPPNLDNKKLITVQSLLRPILERGQANANLENRDWTEPSLWPDWTLSPLKAKYAVIAEIVANLLENAFKYAHKDAEIGVAIMSKGLCIFDDGKKITKIENEKIFQKGFRGSAAKKKDGTGVGLFLARKLAKQIGGELRLLENSSINDVEELKSFKKKNIFYLELPIKELHS,<
|
| 79 |
+
<ps1>,MIENPSQIVKEISDEKEIENSTIEENTSDTPKEEDLSFDHKDIPSADSSSSRRNNDLDTAGFTQEEFASLLGKYDYNFKPGDLVKGTVFALEPKGAMIDIGAKTAAFMPMQEVSINRVEGLSDVLQPSESREFFIMSEENEDGQLALSIRRIEYQRAWERVRQLQKEDATIYSEVFATNRGGALVRVEGLRGFIPGSHISARKIKEDLEGEYLPLKFLEVDEERNRLVLSHRRALVEKKMNRLEVGEVVIGSVKGIKPYGAFIDIGGVSGLLHISEISHEHIETPHNVLNVNDQMKVMIIDLDSERGRISLSTKALEPEPGDMLTDPQKVFNKAEEMAAKYKQMLLEQTDENEEQTVEIAESV,<ps2>,LSRSLDLPSTEGVDTLAQELAKLQDNGKRRIAFLGSRHVPVVDIHLIELIARSLAEEGHTILTSGSQGVNAAVIRAVLGINPSLLTVLLPQSLDKQLPEIKNQLESVIHLVEKSENDELPLPMASSLCNQEIINRCDQLICFAFHDSETLLNSCRCAEEMGKVVSLLFFD,<
|
| 80 |
+
<ps1>,MKKKLAAVSFSALLAIVASSTTSGFASWNTKYWTNEKNFNRISSFNVSENLPEGSKSTTKTSSEVVTASEDGKTLMYTDSDLGVVGLVDISDPAKPKALGIVELEAEPTGIAALGNNIYIGSNTSESYTNPSGALVQYNLDKRRAVKECDLGGQPDSVFVSPDGSFLAVAIENERDEEYKDGQIPQLDEDGKQINPAGYVSLVKLNKKGKIQCNSIKKVDLTGLASIAPSDPEPEFVAINDLGETVVSIQENNHLAVIDKEGKVISHFTAGIVKQMAGMDTKKDGAHKFKKKLKNVRREPDGLTWIDNDHFATANEGDYKHKAPGQAKRGGSRSWTIFKKDGTVVYEDANRLERSIAQIGHFQDGRAGKKGVEPESVTFGKIDGTPYLFVGAERAGIVAVYDITELSQPVLTQLLPSGIGPEGFVAIPDRGLIASANEKDYNKKEPGLSSHVTIYQLQDAPASYPHLTNENGLEFVSWGAISGMVSGEDGKIYAVNDGTFKTQPRIYVIDPSSSPALLERAIDIKLDGKTALFMDQEGITTDGRGGFYISTEGIKKKLTEHPPAIYHVSSEGDILEKITPPPSYLNYAKNPGFEGITRNGNILYIAQQKPWGDDTFNTTKILSYNLISKQWGAVNYQLDRIKKGGVGISELTYHDGALYVIERDSFYGKKAKLKAIYKVDLDGVVFEGLQTTMPPRLYPLVEKELVTDLKPVMKSTGGFILEKVEGLAINNDGQAWISTDNDGTGKKSTGETLFLNIGKI,<ps2>,MKNLKSNKRKIHRKVAAISSIPLLITLISGTIYSFLQPLGVDAFWLIKWHTGNFGIINLQPFYSIFLGIASIISVISGIRLLQKNS,<
|
| 81 |
+
<ps1>,MGFIKNKLFIFIILILLQSCSGGRIGNFFESSFKNIEETKIKEDVKNNLKNKIVIKSGGIVEKNKNIEETKIKEDVKNNLKNKIVIKSGGIVEKNKNIEETKIKEDVKNNLKNKVLKMSEKKSKNNKKISDKNISPKKIIFQPKSYKIIFILKDVDPKDPTEDLRAILRNSDVNFEIEKIERYFDTKNKTIKSN,<ps2>,MKWIIQEEKEEDHLQILNKDSEIGIDEVGRGSVFGPVFSVAVVLSKKSGLTLKKLGVNDSKKLTPKKRKDFFPKIIALSSDYALGQSSVREIDLLGIRHATELSMIRAVKKLKHMPSELLIDGPLTLRLWEGNQRNIISGDSKFISIATASIIAKVMRDSLMERLESKYPGYFIFKNKGYGTKQHFSSLKKHGLTNLHRKSFLNKLNLI,<
|
| 82 |
+
<ps1>,MELPCRRFGRTNLKMPVLSLGGMRFQKSWDELKFSEISRKEQNKVENILNLANKFGFNHIETAKYYGTSEIQLGMGFKSIEKKPKIIQTKIPPNRDPKLFEAELLKSFEKLQVKKIDLLAIHGINTPEHLHQAVKDGGCIDILKKFQQENLIGYIGFSTHGELSLIEKAITTNLFDYINLHWYFINQTNSKLIELAHKYDLGVFIISPTDKGGHLHTPSTKILELCSPLHPIVFNDLFCLRNKYVHTISVGIAKEQDFNLHLEAVSLLSESDHYIPKILNRLKEESINSLGIEWYKSWDKNLPNWKNTPGGINIPVLLWLANLIDSFDLEEFAKSRYQLLGNGSHWFPGNNANLLDVNVCESQLLKVLERHIKPKKVIKKLRVLKDKFGDKSLKRLSKN,<ps2>,MIFRNKRSSIKKTNILSQDELIKHYGINSYEFTHQEKKEIFVCSKVKEFDLIELDQLLQTVGWSRRPIRRVKRALEFSILVVGLWRHDEKFPRLVGFARCTGDGIIEATIWDVAINPVYQGLGLGKELMKYILQELKKIGISKVTLFADAEVVSFYKRQGWELEPKGSKCAFWYAN,<
|
| 83 |
+
<ps1>,MIIIEGFHIFNHKQNCKTKAEWMEQSGMTYDRESEVN,<ps2>,MQIKILVKLFSHLIKVIFKPLLGFAKFFITTYGVFLKFFLQLNGGYWGKIGIGQYSKIERKRFFCILPFYILLALLFGILSLIYWYFVVLFIPFWIERYLTDTAQWNNIFSSIMAFALICGWLLLLSKTK,<
|
| 84 |
+
<ps1>,VEGKNTSITFDGREIRLTTGLYAPQAGGAVMIECGDTSLLVTATKTTKKQAADFLPLICDYEEKLYAAGRIPGGFMRREGRPPERATLIARLIDRPMRPLFPSWMRDEIQIVASCLSLDERVPADVLGVTGASIATLLAEIPFYGPMAAVRVGLIGDDFILNPSYREIEKGDLDIVVAGSPEGIVMIEAGANQLSEQDTIEAIDFGYEAVSELIKAQENLLKDLGIKQVKPLEPEEDKALATYLEKNCTKPIDLILKKFDQSKEERDLELDKIELEVQTKIDSLKDDNQLKVLTSENEKLIHSDFKKLTKKLMRSQIINEGKRVDGRDLDEVRKISASAGILPKRVHGSALFQRGLTQVLSTTTLGTPSDAQEMDDLNPSTEKTYLHHYNFPPYSVGETRPMRTPGRREIGHGALAERAITPVLPGKETFPYVLRVVSEVLSSNGSTSMGSVCGSTLSLLDAGVPLKAPVSGTAMGLIKEGKEVRILTDIQGIEDFLGDMDFKVAGTEKGITALQMDMKITGLPVSVISDAIKKARPARLHILEKMQEAIDKPQESLSPHAPRLLSFRIDPELIGTVIGPGGRTIKGITERTNTKIDIEDGGIVTIASHDGAAAEEAQKIIEGLTRKVHEGEIFPGVVTRIIPIGAFVEILPGKEGMVHISQLSEARVERVEDVVRQGDEVTVRVREIDSRGRINLTLRGVAQNGGMSYPEPTPTPVAPLN,<ps2>,MPKQLSFSNESREALEKGINTVANAVKVTIGPKAKNVVIERKFGSPDIVRDGSTVAKEINLDNPISNLGAKLIEQVASKTKESAGDGTTTATILTQIMVQEGLKNIAAGASPIELKKGMEKGLNFVLEKLRSKSIKINGSDIKKVATVSAGGDEDIGSIISKAMDIVTSDGVITVEESQSLETELDITEGMSFDRGYSSPYFVTDQERQICELENPKILITDQKISTLTNLVPILEEVQKSASPFLILAEDIEGEALTTLVLNKNSGVLNVSAVRAPSFGERRKAALEDIAILTGAKLISEDQSMKLEEVTLNDLGKAKKITISKDKTTIVAFDDTKDLVQERVEKLKREVEITESEYDKDKINERIAKLAGGVALIKVGAATETEMKYKKLRIEDSLNATKAAIEEGVVSGGGQTLIEISNELSNSRKEISDDLTTGIDIITNALLEPTKQIAKNAGFNGDVVIADIKRLGKGFNANNGEYENLNESGILDPTKVIRLALQDSVSIAAMIITTEVAVADIPEPEAAPGGPGADPMGGMGGMGGMGGMGGMGMPGMGGMGMPGMGGMGMPGMGGMGMPGMM,<
|
| 85 |
+
<ps1>,MILSLLLSTFITIFIAELGDKTQLATLTMSGTSNKPLAVFLGSSSALVLASLVGALAGGSISNFLPEIILKSIASITFFIIGIRLFVNSFTSKENDNNQ,<ps2>,LKNLLGCSVKDLEKIALNYGQAAFRGRQIYNWLYNYKNRSKSIDEINVLPLKFRDQLKNEAFLFGELTLKEKYLATDGTLKLLLNTRDNESVECVGIPTEKRLTACLSSQVGCPMDCKFCATGKEGLKRSLKVSEILDQILFIENQMNQKVSNIVFMGMGEPLLNIDELLLSIRSINEDFAISQRKITVSTVAIPKMISKLSELSFQVLGKCQFTLAISLHASNQKIREAIIPSAKNYHIKNIIDDCREYVRETGRRVSFEYLMLHGVNDKLEHADELSNLIKGFQCHVNLIQYNHIEEVEFKQTPIKNAQLFQTRLSNSGINVSFRKSRGSDRNAACGQLRQNDKIK,<
|
| 86 |
+
<ps1>,MYSLEISLRYSPFPLSIQKKEYEDIKRIYDEIKDSMNSDNQNSPLIELSCEKVQDKLITVLAKEVISVQIYEKSAVAGGSKRPGFSLDI,<ps2>,LARDFPLERVRNIGIAAHIDAGKTTTTERILFYSGVVHKIGEVHDGAAVTDWMAQERERGITITAAAISTSWQDHRINIIDTPGHVDFTIEVERSMRVLDGVIAVFCAVGGVQPQSETVWRQADRYSVPRMVFVNKMDRTGADFLKVNQQIKDRLKANAFPIQLPIGAEGDLSGIIDLVSNKAYLYKNDLGTDIEEAPIPDEMKDEALEWRSKLMESVAENDEELIEIFLDKGELTEDQLKKGIREGVLKHGLVPVLCGSAFKNKGVQLVLDAVVDYLPAPIDVKPIQGVLPNGKEDVRPSDDNAPFSALAFKVMSDPYGKLTFVRMYSGVLSKGSYVMNSTKDAKERISRLVILKADEREEVDELRAGDLGAVLGLKNTTTGDTLCNTDDPIVLETLFIPEPVISVAVEPKTKGDMEKLSKALQALSEEDPTFRVSTDQETNQTVIAGMGELHLEILVDRMLREFKVEANIGAPQVSYRETIRSSSKGEGKYARQTGGKGQYGHVVIEMEPAEVGKGFEFVNKIVGGTVPKEYIGPASNGMKETCESGVLAGYPLIDVKVTLVDGSFHDVDSSEMAFKIAGSMAFKDGVKKCNPVLLEPMMKVEVESPDDFLGSVIGDLSSRRGQVEGQSVDDGLSKVQAKVPLAEMFGYATQLRSMTQGRGIFSMEFANYEEVPRNVAEAIISKNQGNS,<
|
| 87 |
+
<ps1>,VKKSLFKPSRKFTLFSAFVTLLNDRLSESILLPILPSFVLLFDSKASTYGLLSCTYQLAQFTASPFIGLMSDRYGRRPVTLFCITGSIIGISILSFTVLFDWSTSLATIPLFLLFIARLIDGLSGGTAATATTILADISSPEKRAKTFGLIGVAFGLSFFLGNIFVVIFAKNTNNNFIIPVIIASIIPIINFILVFFYLPETKPQNELNKSTQILKNPLKQLFKVFKEEKIRKLSLAFFIYFIAFTGLTNILIFFLQESLNWTTKASSGTLVVVGVIAIIVQGGLIGPLVKKFGEMRLTLIGSGFILLACFLLITTPQKNAIVNIYSAVSFLAVGAGLITPTLRALISKKLDGDNQGSILSNLQGLQSLGGVLGIGMAGKVYDDFGPKAPFIAGSIILLFMIYLIAEGKNNNISYN,<ps2>,MTDIFEVPTPDNELLEKAKQLRLASIKTSQTNNDDRIRALNLMADYLEKNSKEIIEANIEDYKKAEIKGISKSLLSRLKLSKEKLNLGIEGVRQVGNLIDPVGQIQIKRELSKGLILERKTVPIGVLGVIFESRPDAVMQISSLAIRSGNGVMLKGGSEANLTNLAIVSALKEGLQDSNLDENAICLLTSRKDSMAMLNLEKYINLIIPRGSNELVKFIQENTEIPVLGHADGICHLYIDNEVNLDMALKVALDSKIQYPAACNAVETLLIHKDTASEFLNKAIPMFNSNDVKLIGDKKSFQLGVAFEANYEDWQTEYLDLILSIKIVNDLEEAIAHIQKFSSKHTDGIITENINNANKFMSEIDSSGVFHNCSTRFADGFRYGFGAEVGISTQTLPPRGPVGLEGLVTYKYFLRGEGHIVDDFSSGKLIYSHKDV,<
|
| 88 |
+
<ps1>,MQNITFKGNVNFDNQKEELNENELFSLKITDSLYKKDIGKFLEILSSHFIP,<ps2>,MRVVIAGAGLAGLSCAKYLVDNGHIPIVLEARDVLGGKVAAWKDEDGDWYETGLHIFFGAYPNMLQLFKELDIEDRLQWKSHSMIFNQPSEPGTYSRFDFPDIPAPANGVTAILSNNDMLSWNEKILFGLGLVPAMLRGQKYLDKCDSKSWTEWLKEHNIPERVNDEVFIAMSKALNFIGPDEISSTVLLTALNRFLQEKNGSKMAFLDGAPPERLCQPMVDYITERGGEVHMNSPLRKIDLNEDSTVKSFTIAPLDSDEKKKVITADAYVSAMPVDLFKLIIPDQWKGINAFSKLDGLIGVPVINIHLWFDKKLTDIDHLLFSRSPLLSVYADMSITCKEYEDPNRSMLELVFAPAKEWINRSDQDIVDATMEELKKLFPTHFIGDDKTKLRKFKVVKTPRSVYKAVPGCQEFRPSQRSPIKNFFLAGDYTMQKYLASMEGAVLSGKLCAETINKEYSKTSNIVSRETSKIN,<
|
| 89 |
+
<ps1>,MLENIWHPSYSAAEYLGITEIKLSHLRENGYFKPGIHWKSSPLGQKKPWNPEVLYNSILCRKIMDEFYSEEKNDQYAA,<ps2>,MRNLIKENIKKTGNNSSRSIKKLLKQRSFVVFISILLTGLGASITSISFKTGIYFINNWRLELLNHFPSVAVLPLFGAVGGAIAGFLIKNFAPAAKGSGVSQIMGFLRHKKVPMNLKVGLVKLISGIIAIGSGFPLGPEGPSVQMGGSVAWQMARWLKAPLAFRRVIVAAGGGAGIAAVFSAPLGGFIYAIEELLNSARPVILLLVVITTFIADSSADIIQALGLDPKAGGFDFNLGFLIQKEYDPSVFFLPIDFIYLVLLGIIIGLFAELYSKYVLFMQKLGKKWYKNKFVLKMSICGLLLGSIYSFLPSSFHNLDELQKIIVEKNTNIEIAFLAVFILFITTGLAAASGAPGGLFYPMLTLGGAIGLIMGTWVEIATGHAPSTYIFAGMGAFVAGCSRTPITAMFLAFALTKNLLIMKPVLISCIASFLVARAFNEESIYERQIQIELED,<
|
| 90 |
+
<ps1>,MKKKSITYTDLSKKQLQHLKELYIQKKVECMSHKELKEFVLEIISHQINDTIGKEEEMEAWMEMSKFYGDQFEIIILEIQQKFANNENLQNFEEDSKEHRLELLEKNNIEQNKQDMWDD,<ps2>,MKEIGWPTIDSKHLVVYSKQMLDLENEIFSQGMPQEALMEKVGIQLSKWLLKRKSLLKKGVIVFLGPGHNGGDGAVIAKELFLKGYLVKLWCPFPLKKTITINYVNYLTSLGVEILGDSPNPEGKDLWIDAIFGNNQKRKVDEELIELFNKKFEKRSGKVVSIDVPTGLCPNSGKPFLKNAVKADFNLVVGLNKIGLLQDTALPYVGELHHIDIGICRSQLCKLESKILKISYQDLRTIKLPLLPKNSSKYKRGRTLVIAGSEKYPGAAYLAIKGAISSGAGFVSAIIPNLVSNSIWQVEPEVVVTGSLSSDKNGNSILFNALKNVDFSAYDSIVIGPGIGLNEEDWEKSTQYLLDLKGLLILDADALNRISKSNLGPKFFLERKSKTWITPHNKEFMRLFPEIDCTNKVELAKKAAKAFDISILLKGANSVIANNENAWQLFGTDAETSRAGLGDLLSGFIGGCSSIELSSRDYTKTESLAKYVFLHSFAASKCKKGSNASLIGAQLSKLMRKTKTRLMS,<
|
| 91 |
+
<ps1>,MRQHVNPLSKNFFEIDPIPPLNQVFENPKLPLHLDIGCASGEFLFELSLKNKNWNYIGIEIREKLVLNANLKMKSRENKNLYFSFGNANNIFNQTNNKSIINLITSISFNFPDPWFKKKHHKRRVIQPKLLNLLSNSMKKGSLIFIKTDVRDLFDHMELTISESIKFKKIPYQDVDFCESFNPNRIQTNREKYVILNQLKIYESIYKKI,<ps2>,MTDDINPIESDFNAALSRYQDGQELIPIAQDFQKIIQQIPNHFAAWTCLSWLQLLLKNNEEALAAAREAVRLNQQDPQARMNLSLALLATNNKGVRDHVELIKKMAMMMPDVKTELKESVEDGFNRYPNWPELTKINKWLEF,<
|
| 92 |
+
<ps1>,VLNDTLSSKFICFHLINISNKLNSTLKIELANPNKSEMFELKSYEKFRDTEDVRFFDISINNSNFRDLVIHNGPAVSPPNDKELGNWQFYIHHKQEDNLLAISGGRTFYLVNLGWEYPFYKVRLESCGLILKIPRGTFHRSVSDENGSVVLNQAIRDKGGSVESEFKVTNSKDNKKLHDCITNLQPKFKIYSVK,<ps2>,LIMSLIPLLPVFHKFNRQFFDQSLTTNREPLVKVRWSDNRLKTTAGFYKRKQLKGVIDSEIILSKPILSKLSCNEIHSTLCHEMIHAWVDRILNINEIHGPNFLSKMNEINKAENNFQISIRHNFPVERKALKYTGKCLNCGEKYMYRKRIKNIACKKCCNLFFNGSWNKKCLILFD,<
|
| 93 |
+
<ps1>,MIKDHPIFLESIRFIKSNLIENNFNYLENRVLERLVHTSGDFNIQKLLEFSEGACEKGVKSLKAGAPILTDTDMAAAAIKSMAKNTNGNLVVSAKHWFDDRDLSELTKTAYGIEKGWIELSANNSGNQSPIIVIGSSPTALVNLLEIIQNSQQIPSLIIGMPVGFIGVRQSKNKLLNTNYPRIVMNSTRGGAAMAAAAVNALLRESI,<ps2>,MREEDIKSFEDAFFDALNLFNNQKWYEAHDAFEDIWNTLEGDERQIIQGIIQVSVSQFHLSKGNLNGATILMGEGLGRIKNRTNIDLGVDLVSFCKCLDELLRKLQYKEELTKNDKPYLLIKEQNEF,<
|
| 94 |
+
<ps1>,MRVKLEPETAFIGKKFAYIFLGIIFALNSIVFIWYFFFSNLTWS,<ps2>,LFQSLFSTKYNYFYNIYIVFHIRTSILLLSGLVLGLWTSWPGIVIPNNWKCFKDMIEKSSKE,<
|
| 95 |
+
<ps1>,VSENIQPSSEENQIVEDLTNKESPEKLPEFKDKELITNLEQNRFECRSCGYIYDPIEGNKKLNIPKNTPFSAIDGNTFACPVCRAGKNLYKDIGPREKPSGFEENLTYGFGFNSLPPGQKNILIFGGLAFAAACFLSLYSLH,<ps2>,LIGIFSAFGAAISWTYACFIWRSQTEKYKSIDINLVKNIIAFLIFLPAFINLSVLNNLKSIITLLFSGVIGIGLGDTFYIKSLQLIGTRRTLSIETLSPILAALSGEIFINENLAFRSYQGILIISISLFILLRQRTNLIVNNLTNITERNNLSVYVFPFLSVLCAVSGGLLSRKVFLESNLSPFQATEIRLLGAIIFLIIIKKFRINFFLKKLDFNDQKRFLLSILLGTNLGILLQQIVFKTLPLGIGWTLLSTSPVISLFFATKEEGQITKGIIFFTTLLFLGLCLIII,<
|
| 96 |
+
<ps1>,MKILLSVFFLFAFIPPSKGVTTKMFKVLDTCARYRLGEIDAKQAIEKLKLKSVNSSEIDLKNIVSNYCSVFTPNENIKF,<ps2>,MPKKHPTRRQFLNFGKLSLLFFLNSCSNSLKKIKIGFQSSTYPKSFRDTFPAIWQKENINFSKLKLEKNKIKFSKSDFILINDGWLKSINFANFQNINNLFLNDLLDNRSRDYLKSFKEYQRNKLFPIGVVPYAVIIKNNKDLIYEASNNWDFLLDEKLKGKIIFPQSPRILISISKRINVKNSLSKLKEQAMLFDDKNSINWLINSDASVAIIPFSLCEKYLRVDSRLSMVFPNKGVPLMWNFLLTKSKINNIVLFDWIKSLEKRSTIDELANQGWYLPFKNEYSQDKYNIKTENSNYGPSENCWENSWSFSSLNYEEKVNLENLWNQS,<
|
| 97 |
+
<ps1>,MRFKVSLKKDGKEFDEVVIANNKKDAIEVALKNNPEAEVLNSDWTFKL,<ps2>,MNDHNSKDNYEAQTLILNDSNGNELFCYLEQIVKVEEKEYALLTPVDTPVSLFKINENDEPELIEKIEKNEQVLKNADAVLQEHDLKLIRSAVTLTVSGELEEPIYDELEEDGIEEESETYELLVSFNLLEQEYGLYIPLDPFFIVGKLINQGALLIEDDEFDKVQPLIESELEKSSF,<
|
| 98 |
+
<ps1>,MENSKPNYWQNAERTNGRMAMMGFFALVVNYGLFGWIIPGIF,<ps2>,LMNKYDVIIIGSGIGGLCCGSLLALAGKKVLIAEAHSQPGGVAHSFNMRGYKFESGPSLWSGIGKWPTTNPLGQILRLLDEKVELIKYQGWHVNVPEGEFNLEVGQEPFKERIRLLRGEKSVKEWDSFVSGIRPLSQIVSEIPLLSSSPETINFLEIIKLASKFLPNIKSLPKLNGGFGDIVDSHLNDPFLRNWVDLLSFLISGMPMHDTNSAAMATLFDEWFKPASYLEYPKGGSESIVKALVDSFKKNGGELILSSKVEAVNFSKNIASGVTLENGSNFISNFVVMNTDAWTSRKLIPQEFQKKWSPKAKDINKCGSFLHIHLGFDASGLQNLPIHAIHVDNWERGITAERNVAVFSIPSVLDKSMAPKGKHVLHGYTPANEPWEIWKNLKSNELAYKELKEERCSIFLKSLRKIIPDIDNRIEIKLLGTPLTHKKYTNTYCGSYGPALSAAQGLFPGCKTSVRNLLTCGASTFPGIGIPAVSASGAYAAEKIMGKKEYKKLLKTIDL,<
|
| 99 |
+
<ps1>,VFFKSNFSYSDSNKSYSDLLLELDSGNIQSIYFYPRKREIDVLYKNGNKEKIPILYNDQLILEKASENNVDLTINNSRKESSAANSFASVGLFLIFIIAIVLILKSTSKLASKALGFGKNKSKFVTIDDVETRFDDVAGVPEAAEELKEVIKFLNEPKKFTDLGAKVPKGVLLIGPPGTGKTLLAKAIAGESGVPFLSIAASEFVELFVGVGASRVRDLFEKAKEKSPCIIFIDEIDSIGRQRGSGIGGGNDEREQTLNQLLTELDGFADNSGIIVIAATNRPDILDSALLRPGRFDRKIEVMLPDLDGRKKILSVHSLSKPLAKDVDLSYWATRTVGFSGADLANLMNESAIHCAREDSKLITYSHIENALDKVTLGLRTSIISSQNMKKIIAYNEVGRAIVSAVKNGVDSVDKITILPRSGYLGGYTKINPDEDIVSSGLISKKLLLSKIEIALAGRAAEIIVYGKNEITQCSFNDISYATSIIREMVTKYGFSIIGPLSLEDGGEMSIGDGFVRNKSTIADNTYSRIDNEIINISKISLNNAIKIISNNRILLEKLVELLLIKETVENNTFKKITFDLLKV,<ps2>,MRRKIFFEVFNIKKLSILVLGFTLGVIAIWPGIISRNSRKCFFNIIKDGSDGNIQIKTILLVNPNYLLRIKNAKNDYWKVLLVGDACFRKF,<
|
| 100 |
+
<ps1>,MVTSQKKGPDSSASDNELSPDQTLGLVSLSLMQKLSQKDPSFSWLGEMKPDQLNLKNLRDRLELTELAIKTGAPLTTSEVSILMGAKPGKSKIERGGILAIKVARNVWKLSKLGQGSSYYRN,<ps2>,MILKVLEFEFDLIVLGAGSGGLAAAKRAASYGAKVAIIEVNKIGGTCVIRGCVPKKLMVYAANNRRNMLSSEGYGLISKEITFESNILLKNVREEVSRLSVLHSNSLKKLNVKVFEGLGRFLNQNTVEVVCPKTKNILRKVSAKSILISVGGKPKKLNIPGTDFAWTSDDIFELKDFPKKLLIVGGGYIACEFASIFKNLGTEVTQLIRGENLLNGFDKDLSECLEKSMTSLGINLKFKNQLKSIKKINDGLESTLESGSKLLTDNILVATGREPSLKRLNLDTLNLKMDGIYLEVNELNKTSISNIFAIGDIVKRPNLTPVAIEQGRVFADNYFAALKRKVNYENIPKAVFTIPEISTVGLSEEKANEIYSEVNVQVFKCNFTPMSNTFKKNKSKCMLKLVVNKKNDKVLGCHMFGEAASEIIQMVAVSLNTGITKKDFDTTMALHPTISEEFVTMYG,<
|
| 101 |
+
<ps1>,MFFLSIPQAWHLAGTWSEQLPNDSNLIGMSQTELMMTLHSIFVPLLLVISYFLFLKISKNESKKVKG,<ps2>,MTSTLPNDNIKNIDEKISNKLISEIIRDRIKSKGTRFSANDNIADFINPGELKVLEKEVASRIKDLLKSLVIDVDNDHNTQETAERVSKMYLNEVFKGRYHEQPKVTSFPNDKNLDEIYTVGPITVRSACSHHLVPILGECWIGIKPGSKVIGLSKFARVADWVFSRPHIQEEAVMILADEIEKLCEPKGLGIIVKAQHYCMKWRGVKEPNTSMINSVVRGDFRHDISLKQEFFELVRQQSSNNNY,<
|
README.md
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
library_name: pytorch
|
| 4 |
+
tags:
|
| 5 |
+
- protein-protein-interaction
|
| 6 |
+
- ppi
|
| 7 |
+
- protein-language-model
|
| 8 |
+
- gpt-2
|
| 9 |
+
- nanogpt
|
| 10 |
+
- character-level
|
| 11 |
+
- trained-from-scratch
|
| 12 |
+
- bioinformatics
|
| 13 |
+
- biology
|
| 14 |
+
pipeline_tag: text-generation
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
# ppiGPLM
|
| 18 |
+
|
| 19 |
+
A GPT-2 small protein language model trained from scratch on protein-pair prompts and used for binary protein-protein interaction (PPI) classification via next-token prediction. The implementation is based on [nanoGPT](https://github.com/karpathy/nanoGPT) by Andrej Karpathy, with character-level tokenization over amino acids.
|
| 20 |
+
|
| 21 |
+

|
| 22 |
+
|
| 23 |
+
## Overview
|
| 24 |
+
|
| 25 |
+
ppiGPLM uses a GPT-2 small architecture (12 layers, 12 attention heads, embedding dimension 768) with character-level tokenization to predict whether two proteins interact. Rather than using a separate classification head, ppiGPLM frames PPI prediction as next-token prediction: given a structured prompt encoding a protein pair, the model predicts a binary label (`0` or `1`) as the next token. Softmax probabilities over the label tokens provide continuous interaction scores.
|
| 26 |
+
|
| 27 |
+
The model was developed for the *Prochlorococcus marinus* MED4 interactome, where it serves as one component of a tri-model consensus framework for computational PPI screening.
|
| 28 |
+
|
| 29 |
+
## Architecture
|
| 30 |
+
|
| 31 |
+
| Parameter | Value |
|
| 32 |
+
|-----------|-------|
|
| 33 |
+
| Architecture | GPT-2 small |
|
| 34 |
+
| Layers | 12 |
|
| 35 |
+
| Attention heads | 12 |
|
| 36 |
+
| Embedding dimension | 768 |
|
| 37 |
+
| Context length | 4,096 tokens |
|
| 38 |
+
| Tokenization | Character-level (one token per amino acid) |
|
| 39 |
+
| Dropout | 0.2 |
|
| 40 |
+
| Optimizer | AdamW (lr = 5e-4, beta2 = 0.99) |
|
| 41 |
+
| Training iterations | 8,000 |
|
| 42 |
+
|
| 43 |
+
## Installation
|
| 44 |
+
|
| 45 |
+
### Prerequisites
|
| 46 |
+
|
| 47 |
+
- Python 3.8+
|
| 48 |
+
- CUDA-capable GPU (recommended) or CPU
|
| 49 |
+
- conda (recommended) or pip
|
| 50 |
+
|
| 51 |
+
### Setup
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
# Clone the repository
|
| 55 |
+
git clone https://github.com/kouroshSA/ppiGPLM.git
|
| 56 |
+
cd ppiGPLM
|
| 57 |
+
|
| 58 |
+
# Create a conda environment
|
| 59 |
+
conda create -n gpt python=3.10
|
| 60 |
+
conda activate gpt
|
| 61 |
+
pip install -r requirements.txt
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
## Repository Structure
|
| 65 |
+
|
| 66 |
+
```
|
| 67 |
+
ppiGPLM/
|
| 68 |
+
|-- model.py # GPT model definition
|
| 69 |
+
|-- train_.py # Training loop
|
| 70 |
+
|-- sample_fasta3.3_softmax_error_handling3e.py # Batch inference script
|
| 71 |
+
|-- LES-wrapper.py # Learning Efficiency Score evaluation wrapper
|
| 72 |
+
|-- LES-wrapper.md # LES-wrapper documentation
|
| 73 |
+
|-- roc_analysis_color_threshold_F1e.py # ROC curve analysis
|
| 74 |
+
|-- configurator.py # Configuration utility
|
| 75 |
+
|-- config/
|
| 76 |
+
| |-- train_par_gpt2-s_scratch.py # Training config (GPT-2 small, from scratch)
|
| 77 |
+
| +-- finetune_label3.py # Fine-tuning config
|
| 78 |
+
|-- data/
|
| 79 |
+
| +-- MED4_char/ # MED4 PPI dataset
|
| 80 |
+
| |-- prepare.py # Character-level tokenizer
|
| 81 |
+
| +-- meta.pkl # Vocabulary (stoi/itos mappings)
|
| 82 |
+
|-- assets/
|
| 83 |
+
| |-- ppiGPLM.png # ASCII workflow diagram
|
| 84 |
+
| |-- tri_model_consensus.svg # Tri-model consensus framework (SVG)
|
| 85 |
+
| +-- tri_model_consensus.png # Tri-model consensus framework (PNG)
|
| 86 |
+
|-- requirements.txt
|
| 87 |
+
|-- LICENSE
|
| 88 |
+
+-- README.md
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
## Usage
|
| 92 |
+
|
| 93 |
+
### Prompt Format
|
| 94 |
+
|
| 95 |
+
Each prompt encodes a protein pair with metadata tags:
|
| 96 |
+
|
| 97 |
+
```
|
| 98 |
+
<ps1>,MSEQ1...,<ps2>,MSEQ2...,<l1>,len1,<l2>,len2,<l3>
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
- `<ps1>`, `<ps2>`: Protein sequence delimiters
|
| 102 |
+
- `<l1>`, `<l2>`, `<l3>`: Length field delimiters
|
| 103 |
+
- The model predicts `1` (interacting) or `0` (non-interacting) as the next token
|
| 104 |
+
|
| 105 |
+
### Batch Inference
|
| 106 |
+
|
| 107 |
+
Run inference on a set of protein pairs:
|
| 108 |
+
|
| 109 |
+
```bash
|
| 110 |
+
python sample_fasta3.3_softmax_error_handling3e.py \
|
| 111 |
+
--input_file protein_pairs.txt \
|
| 112 |
+
--output_dir ppi_results \
|
| 113 |
+
--output_prefix my_predictions
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
This produces:
|
| 117 |
+
- `*_classifications.txt`: Full model output in FASTA-like format
|
| 118 |
+
- `*_probabilities.csv`: Per-pair probabilities for class 1 and class 0
|
| 119 |
+
|
| 120 |
+
### Training
|
| 121 |
+
|
| 122 |
+
#### Prepare data
|
| 123 |
+
|
| 124 |
+
```bash
|
| 125 |
+
python data/MED4_char/prepare.py
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
This creates `train.bin`, `val.bin`, and `meta.pkl` from the input training data.
|
| 129 |
+
|
| 130 |
+
#### Train the model
|
| 131 |
+
|
| 132 |
+
```bash
|
| 133 |
+
# Single GPU
|
| 134 |
+
python train_.py config/train_par_gpt2-s_scratch.py
|
| 135 |
+
|
| 136 |
+
# Multi-GPU (2 GPUs)
|
| 137 |
+
torchrun --standalone --nproc_per_node=2 train_.py config/train_par_gpt2-s_scratch.py
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
### Learning Efficiency Score (LES) Evaluation
|
| 141 |
+
|
| 142 |
+
The LES-wrapper automates evaluation across multiple training checkpoints, computing ROC-AUC, F1, and optimal threshold at each checkpoint and deriving integrated Learning Efficiency Scores:
|
| 143 |
+
|
| 144 |
+
```bash
|
| 145 |
+
python LES-wrapper.py \
|
| 146 |
+
--checkpoint_dir out \
|
| 147 |
+
--prs_file PRS.txt \
|
| 148 |
+
--rrs_file RRS.txt \
|
| 149 |
+
--output_dir LES_results \
|
| 150 |
+
--vanilla
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
See [LES-wrapper.md](LES-wrapper.md) for full documentation.
|
| 154 |
+
|
| 155 |
+
### Standalone ROC Analysis
|
| 156 |
+
|
| 157 |
+
```bash
|
| 158 |
+
python roc_analysis_color_threshold_F1e.py \
|
| 159 |
+
--prs_file ppi_results/PRS_probabilities.csv \
|
| 160 |
+
--rrs_file ppi_results/RRS_probabilities.csv
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
## Architecture Diagrams
|
| 164 |
+
|
| 165 |
+
The ASCII workflow diagram (`assets/ppiGPLM.png`) covers:
|
| 166 |
+
- **A.** Prompt-based input strategy (character-level tokenization)
|
| 167 |
+
- **B.** Model architecture (GPT-2 small, causal self-attention)
|
| 168 |
+
- **C.** Training pipeline
|
| 169 |
+
- **D.** Inference pipeline with LES evaluation
|
| 170 |
+
|
| 171 |
+
> Note: the diagram lists "Flash Attention" — this path is taken automatically
|
| 172 |
+
> when running on PyTorch ≥ 2.0; older versions fall back to the manual
|
| 173 |
+
> scaled-dot-product implementation. Numerical results are equivalent.
|
| 174 |
+
|
| 175 |
+
See `assets/tri_model_consensus.svg` for the tri-model consensus framework with [ppiDCE](https://github.com/kouroshSA/ppiDCE) and [ppiBTEP](https://github.com/kouroshSA/ppiBTEP).
|
| 176 |
+
|
| 177 |
+
## Citation
|
| 178 |
+
|
| 179 |
+
If you use this software, please cite:
|
| 180 |
+
|
| 181 |
+
```
|
| 182 |
+
Daakour, S. et al. (2026).
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
This software is built on nanoGPT:
|
| 186 |
+
|
| 187 |
+
```
|
| 188 |
+
Karpathy, A. (2022). nanoGPT. https://github.com/karpathy/nanoGPT
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
## License
|
| 192 |
+
|
| 193 |
+
This project is licensed under the MIT License. See [LICENSE](LICENSE) for details.
|
| 194 |
+
|
| 195 |
+
The original nanoGPT framework is by Andrej Karpathy (MIT License, 2022). Modifications and additions for protein-protein interaction prediction are by Kourosh Salehi-Ashtiani (MIT License, 2026).
|
assets/MANUSCRIPT_NUMBERS_TO_VERIFY.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Numbers in tri_model_consensus.svg that must match the manuscript
|
| 2 |
+
|
| 3 |
+
- "~15,000 protein pairs" (input cohort: low-confidence Y2H interactions)
|
| 4 |
+
- "P >= 0.95" (per-model consensus threshold)
|
| 5 |
+
- "657 Tri-Model-Supported Interactions" (intersection output)
|
| 6 |
+
|
| 7 |
+
If these change in the manuscript, update the SVG and re-render the PNG.
|
assets/ppiGPLM.png
ADDED
|
Git LFS Details
|
assets/tri_model_consensus.png
ADDED
|
Git LFS Details
|
assets/tri_model_consensus.svg
ADDED
|
|
checkpoints/ppiGPLM_ckpt_7e.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ppiGPLM_ckpt_7e.pt
|
| 2 |
+
|
| 3 |
+
**Checkpoint used for screening low-confidence Y2H pairs in the *Prochlorococcus marinus* MED4 interactome.**
|
| 4 |
+
|
| 5 |
+
## Provenance
|
| 6 |
+
|
| 7 |
+
| | |
|
| 8 |
+
|---|---|
|
| 9 |
+
| Model | ppiGPLM (GPT-2 small, character-level tokenization, trained from scratch on protein-pair prompts) |
|
| 10 |
+
| Architecture | 12 layers, 12 attention heads, 768 emb dim, 4,096 token context |
|
| 11 |
+
| File size | ~1009 MB |
|
| 12 |
+
| Training run | `out_7e` — epoch 70.94 (≈ epoch 71) |
|
| 13 |
+
| Training data | MED4 PPI dataset (`data/MED4_char/`) — prepared by `data/MED4_char/prepare.py` from the project's input CSV/TXT |
|
| 14 |
+
| Tokenization | Per-amino-acid characters (one token per residue), prompts wrapped with `<ps1>`, `<ps2>`, `<l1>`, `<l2>`, `<l3>` delimiter tokens |
|
| 15 |
+
|
| 16 |
+
## Intended use
|
| 17 |
+
|
| 18 |
+
Inference / screening of candidate MED4 protein–protein interactions that
|
| 19 |
+
were originally flagged as **low-confidence Y2H hits**. The model is prompted
|
| 20 |
+
with `<ps1>,Seq_A,<ps2>,Seq_B,<l1>,len_A,<l2>,len_B,<l3>` and the next-token
|
| 21 |
+
softmax over `0` / `1` is used (alongside the other tri-model components,
|
| 22 |
+
[ppiBTEP](https://huggingface.co/kouroshSA/ppiBTEP) and
|
| 23 |
+
[ppiDCE](https://huggingface.co/kouroshSA/ppiDCE)) to retain or discard the
|
| 24 |
+
pair.
|
| 25 |
+
|
| 26 |
+
## Loading
|
| 27 |
+
|
| 28 |
+
Use `sample_fasta3.3_softmax_error_handling3e.py` from the parent repo, or
|
| 29 |
+
load the checkpoint directly with `train_.py` / `model.py`. The vocabulary
|
| 30 |
+
(`stoi` / `itos` mappings) is read from `data/MED4_char/meta.pkl`.
|
checkpoints/ppiGPLM_ckpt_7e.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f909843f84f1be924fa411de7b50246b408ded09795b45940e34534a29b46e5
|
| 3 |
+
size 1057547653
|
config/finetune_label3.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
|
| 3 |
+
out_dir = 'out-FT'
|
| 4 |
+
eval_interval = 50
|
| 5 |
+
eval_iters = 10
|
| 6 |
+
wandb_log = False # feel free to turn on
|
| 7 |
+
wandb_project = 'Classification4'
|
| 8 |
+
wandb_run_name = 'algaGPT2-S_set4x2_resume'
|
| 9 |
+
|
| 10 |
+
dataset = 'MED4_char'
|
| 11 |
+
init_from = 'resume' # continue from an existing checkpoint rather than loading a pretrained GPT-2 model
|
| 12 |
+
|
| 13 |
+
# only save checkpoints if the validation loss improves
|
| 14 |
+
|
| 15 |
+
always_save_checkpoint = False
|
| 16 |
+
|
| 17 |
+
# the number of examples per iter:
|
| 18 |
+
# 48 batch_size * 2 grad_accum * block_size tokens = 96 * block_size tokens/iter
|
| 19 |
+
# has xx tokens, so 1 epoch ~= xx iters
|
| 20 |
+
batch_size = 48
|
| 21 |
+
gradient_accumulation_steps = 2
|
| 22 |
+
max_iters = 2000000
|
| 23 |
+
|
| 24 |
+
# finetune at constant LR
|
| 25 |
+
learning_rate = 1e-5
|
| 26 |
+
decay_lr = False
|
| 27 |
+
|
| 28 |
+
# To resume training (1 GPU) from a checkpoint in 'out-FT', run: python train_.py config/finetune_label3.py
|
| 29 |
+
# To resume training GPT2-M or -S on 2 GPUs: torchrun --standalone --nproc_per_node=2 train_.py config/finetune_label3.py
|
config/train_par_gpt2-s_scratch.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training config: GPT-2 small (124M) from scratch on character-level protein-pair prompts (MED4 PPI).
|
| 2 |
+
|
| 3 |
+
out_dir = 'out_14e'
|
| 4 |
+
eval_interval = 250 # keep frequent because we'll overfit
|
| 5 |
+
log_interval = 10 # don't print too too often
|
| 6 |
+
|
| 7 |
+
# we expect to overfit on this small dataset; a checkpoint is still saved at every eval, not only when val improves
|
| 8 |
+
always_save_checkpoint = True
|
| 9 |
+
|
| 10 |
+
wandb_log = True
|
| 11 |
+
# override via command line if you like
|
| 12 |
+
wandb_project = 'ppiGLM_MED4Solo'
|
| 13 |
+
wandb_run_name = 'ppiGPLM-med4_4k_14epoch'
|
| 14 |
+
dataset = 'med4_4k_14epoch'
|
| 15 |
+
init_from = 'scratch' # train from random init; no pretrained GPT-2 weights loaded
|
| 16 |
+
gradient_accumulation_steps = 2
|
| 17 |
+
batch_size = 12
|
| 18 |
+
|
| 19 |
+
block_size = 4096 # context of up to n previous characters
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# GPT-2 small model (12 layers, 12 heads, 768-dim embeddings)
|
| 23 |
+
n_layer = 12
|
| 24 |
+
n_head = 12
|
| 25 |
+
n_embd = 768
|
| 26 |
+
dropout = 0.2
|
| 27 |
+
|
| 28 |
+
# using above parameters: gradient_accumulation_steps = 2, batch_size = 12, block_size = 4096 -> tokens/iter = 2 * 12 * 4096 = 98304, epoch = 97600
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
learning_rate = 5e-4 # with baby networks can afford to go a bit higher
|
| 32 |
+
max_iters = 8001
|
| 33 |
+
lr_decay_iters = 8000 # make equal to max_iters usually
|
| 34 |
+
min_lr = 1e-5 # learning_rate / 10 usually
|
| 35 |
+
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
|
| 36 |
+
|
| 37 |
+
warmup_iters = 200 # not super necessary potentially
|
| 38 |
+
|
| 39 |
+
# on macbook also add
|
| 40 |
+
# device = 'cpu' # run on cpu only
|
| 41 |
+
# compile = False # do not torch compile the model
|
| 42 |
+
|
| 43 |
+
# To tokenize the training data: python data/MED4_char/prepare.py
|
| 44 |
+
#
|
| 45 |
+
# To train GPT-2 small from scratch on a single GPU:
|
| 46 |
+
# python train_.py config/train_par_gpt2-s_scratch.py
|
| 47 |
+
#
|
| 48 |
+
# To train on 2 GPUs with DDP:
|
| 49 |
+
# torchrun --standalone --nproc_per_node=2 train_.py config/train_par_gpt2-s_scratch.py
|
configurator.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Poor Man's Configurator. Probably a terrible idea. Example usage:
$ python train_.py config/override_file.py --batch_size=32
this will first run config/override_file.py, then override batch_size to 32

The code in this file will be run as follows from e.g. train_.py:
>>> exec(open('configurator.py').read())

So it's not a Python module, it's just shuttling this code away from train_.py
The code in this script then overrides the globals()

Argument handling:
- an argument without '=' is treated as the path of a config file; its text is
  printed and then executed in the caller's global namespace
- an argument of the form --key=value overrides an existing global `key`; the
  value is parsed with ast.literal_eval (falling back to the raw string) and
  must have exactly the same type as the current value
- an unknown key raises ValueError

I know people are not going to love this, I just really dislike configuration
complexity and having to prepend config. to every single variable. If someone
comes up with a better simple Python solution I am all ears.
"""

import sys
from ast import literal_eval

for arg in sys.argv[1:]:
    if '=' not in arg:
        # assume it's the name of a config file
        assert not arg.startswith('--')
        config_file = arg
        print(f"Overriding config with {config_file}:")
        # read the file once so that the echoed text is exactly what gets
        # executed, and so that no file handle is left open
        with open(config_file) as f:
            _config_source = f.read()
        print(_config_source)
        exec(_config_source)
    else:
        # assume it's a --key=value argument
        assert arg.startswith('--')
        # split on the first '=' only: the value itself may contain '='
        key, val = arg.split('=', 1)
        key = key[2:]
        if key in globals():
            try:
                # attempt to eval it (e.g. if bool, number, or etc)
                attempt = literal_eval(val)
            except (SyntaxError, ValueError):
                # if that goes wrong, just use the string
                attempt = val
            # ensure the types match ok (exact match on purpose: bool must not pass as int)
            assert type(attempt) == type(globals()[key])
            # cross fingers
            print(f"Overriding: {key} = {attempt}")
            globals()[key] = attempt
        else:
            raise ValueError(f"Unknown config key: {key}")
|
data/MED4_char/meta.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:983cb09bbc0138f093476ba5db6ec8acaefb7e45211a1194560f7c2abbf4c93c
|
| 3 |
+
size 343
|
data/MED4_char/prepare.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prepare the dataset in this directory (`input.txt`) for character-level language modeling.
|
| 3 |
+
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
|
| 4 |
+
Will save train.bin, val.bin containing the ids, and meta.pkl containing the
|
| 5 |
+
encoder and decoder and some other related info.
|
| 6 |
+
"""
|
| 7 |
+
import os
|
| 8 |
+
import pickle
|
| 9 |
+
import requests
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
# locate the raw training text, which must sit next to this script as input.txt
input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
if not os.path.exists(input_file_path):
    # The upstream nanoGPT script fetched tiny-Shakespeare at this point. In this
    # dataset directory that would silently tokenize the wrong corpus and write a
    # meta.pkl with the wrong vocabulary, so stop with a clear error instead.
    raise FileNotFoundError(
        f"{input_file_path} not found: place the training text there before running prepare.py"
    )

with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(set(data))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create the character <-> integer mappings (ids follow sorted character order)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, output a list of integer ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, output a string."""
    return ''.join([itos[i] for i in l])


# create the train and validation splits: first 90% of characters / last 10%
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode both to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files (uint16 ids) next to this script
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)

# Example output for the MED4 data (as recorded in data/MED4_char/readme.md):
# length of dataset in characters: 6,159,128
# all the unique characters:
# ,012<>ACDEFGHIKLMNPQRSTVWYps
# vocab size: 29
# train has 5,543,215 tokens
# val has 615,913 tokens
|
data/MED4_char/readme.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# character-level
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
(gpt) ksa@ksa-Z790-UD-AC-1:~/Desktop/ppiGPT_MED4_solo/data/shakespeare_char$ python prepare.py
|
| 6 |
+
|
| 7 |
+
length of dataset in characters: 6,159,128
|
| 8 |
+
all the unique characters:
|
| 9 |
+
,012<>ACDEFGHIKLMNPQRSTVWYps
|
| 10 |
+
vocab size: 29
|
| 11 |
+
train has 5,543,215 tokens
|
| 12 |
+
val has 615,913 tokens
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
epoch = 460,619,820 / 98304 = 4686 it
|
model.py
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Full definition of a GPT Language Model, all of it in this single file.
|
| 3 |
+
References:
|
| 4 |
+
1) the official GPT-2 TensorFlow implementation released by OpenAI:
|
| 5 |
+
https://github.com/openai/gpt-2/blob/master/src/model.py
|
| 6 |
+
2) huggingface/transformers PyTorch implementation:
|
| 7 |
+
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import math
|
| 11 |
+
import inspect
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
import torch.nn as nn
|
| 16 |
+
from torch.nn import functional as F
|
| 17 |
+
|
| 18 |
+
class LayerNorm(nn.Module):
    """Layer normalization with a learnable scale and an optional learnable bias."""

    def __init__(self, ndim, bias):
        super().__init__()
        # scale starts at one; the shift (when enabled) starts at zero
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        # normalize over the trailing dimension(s) matching the weight's shape
        normalized_shape = self.weight.shape
        return F.layer_norm(input, normalized_shape, self.weight, self.bias, 1e-5)
|
| 28 |
+
|
| 29 |
+
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which each position attends only to itself and earlier positions."""

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # a single fused projection produces query, key and value for all heads
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # projection applied to the re-assembled head outputs
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # dropout on the attention weights and on the projected output
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # use F.scaled_dot_product_attention when this PyTorch build provides it (PyTorch >= 2.0)
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # lower-triangular mask so attention only looks left; needed by the manual path only
            size = config.block_size
            causal_mask = torch.tril(torch.ones(size, size)).view(1, 1, size, size)
            self.register_buffer("bias", causal_mask)

    def forward(self, x):
        # x: (batch, sequence length, n_embd)
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head

        def split_heads(t):
            # (batch, seq_len, width) -> (batch, n_head, seq_len, head_dim)
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = split_heads(k)
        q = split_heads(q)
        v = split_heads(v)

        if self.flash:
            # fused kernel; attention dropout is applied only while training
            drop_p = self.dropout if self.training else 0
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True
            )
        else:
            # manual path: scaled scores -> causal mask -> softmax -> dropout -> weighted sum
            scale = 1.0 / math.sqrt(k.size(-1))
            att = (q @ k.transpose(-2, -1)) * scale
            att = att.masked_fill(self.bias[:, :, :seq_len, :seq_len] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v  # (batch, n_head, seq_len, head_dim)

        # put the heads back side by side: (batch, seq_len, width)
        y = y.transpose(1, 2).contiguous().view(batch, seq_len, width)

        # output projection followed by residual dropout
        return self.resid_dropout(self.c_proj(y))
|
| 77 |
+
|
| 78 |
+
class MLP(nn.Module):
    """Position-wise feed-forward network: expand to 4x width, GELU, project back, dropout."""

    def __init__(self, config):
        super().__init__()
        hidden = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        expanded = self.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(expanded))
|
| 93 |
+
|
| 94 |
+
class Block(nn.Module):
    """One transformer layer: layer-normed attention and layer-normed MLP, each added back residually."""

    def __init__(self, config):
        super().__init__()
        width, use_bias = config.n_embd, config.bias
        self.ln_1 = LayerNorm(width, bias=use_bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(width, bias=use_bias)
        self.mlp = MLP(config)

    def forward(self, x):
        # normalization is applied to the input of each sub-layer, not to its output
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))
|
| 107 |
+
|
| 108 |
+
@dataclass
class GPTConfig:
    # Model hyperparameters; the defaults below are the values used when a field is not supplied.
    # maximum sequence length (also the size of the position-embedding table)
    block_size: int = 1024
    # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    vocab_size: int = 50_304
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0
    # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    bias: bool = True
|
| 117 |
+
|
| 118 |
+
class GPT(nn.Module):
|
| 119 |
+
|
| 120 |
+
def __init__(self, config):
|
| 121 |
+
super().__init__()
|
| 122 |
+
assert config.vocab_size is not None
|
| 123 |
+
assert config.block_size is not None
|
| 124 |
+
self.config = config
|
| 125 |
+
|
| 126 |
+
self.transformer = nn.ModuleDict(dict(
|
| 127 |
+
wte = nn.Embedding(config.vocab_size, config.n_embd),
|
| 128 |
+
wpe = nn.Embedding(config.block_size, config.n_embd),
|
| 129 |
+
drop = nn.Dropout(config.dropout),
|
| 130 |
+
h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
|
| 131 |
+
ln_f = LayerNorm(config.n_embd, bias=config.bias),
|
| 132 |
+
))
|
| 133 |
+
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
| 134 |
+
# with weight tying when using torch.compile() some warnings get generated:
|
| 135 |
+
# "UserWarning: functional_call was passed multiple values for tied weights.
|
| 136 |
+
# This behavior is deprecated and will be an error in future versions"
|
| 137 |
+
# not 100% sure what this is, so far seems to be harmless. TODO investigate
|
| 138 |
+
self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
|
| 139 |
+
|
| 140 |
+
# init all weights
|
| 141 |
+
self.apply(self._init_weights)
|
| 142 |
+
# apply special scaled init to the residual projections, per GPT-2 paper
|
| 143 |
+
for pn, p in self.named_parameters():
|
| 144 |
+
if pn.endswith('c_proj.weight'):
|
| 145 |
+
torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
|
| 146 |
+
|
| 147 |
+
# report number of parameters
|
| 148 |
+
print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
|
| 149 |
+
|
| 150 |
+
def get_num_params(self, non_embedding=True):
|
| 151 |
+
"""
|
| 152 |
+
Return the number of parameters in the model.
|
| 153 |
+
For non-embedding count (default), the position embeddings get subtracted.
|
| 154 |
+
The token embeddings would too, except due to the parameter sharing these
|
| 155 |
+
params are actually used as weights in the final layer, so we include them.
|
| 156 |
+
"""
|
| 157 |
+
n_params = sum(p.numel() for p in self.parameters())
|
| 158 |
+
if non_embedding:
|
| 159 |
+
n_params -= self.transformer.wpe.weight.numel()
|
| 160 |
+
return n_params
|
| 161 |
+
|
| 162 |
+
def _init_weights(self, module):
|
| 163 |
+
if isinstance(module, nn.Linear):
|
| 164 |
+
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
|
| 165 |
+
if module.bias is not None:
|
| 166 |
+
torch.nn.init.zeros_(module.bias)
|
| 167 |
+
elif isinstance(module, nn.Embedding):
|
| 168 |
+
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
|
| 169 |
+
|
| 170 |
+
def forward(self, idx, targets=None):
|
| 171 |
+
device = idx.device
|
| 172 |
+
b, t = idx.size()
|
| 173 |
+
assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
|
| 174 |
+
pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)
|
| 175 |
+
|
| 176 |
+
# forward the GPT model itself
|
| 177 |
+
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
|
| 178 |
+
pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
|
| 179 |
+
x = self.transformer.drop(tok_emb + pos_emb)
|
| 180 |
+
for block in self.transformer.h:
|
| 181 |
+
x = block(x)
|
| 182 |
+
x = self.transformer.ln_f(x)
|
| 183 |
+
|
| 184 |
+
if targets is not None:
|
| 185 |
+
# if we are given some desired targets also calculate the loss
|
| 186 |
+
logits = self.lm_head(x)
|
| 187 |
+
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
|
| 188 |
+
else:
|
| 189 |
+
# inference-time mini-optimization: only forward the lm_head on the very last position
|
| 190 |
+
logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
|
| 191 |
+
loss = None
|
| 192 |
+
|
| 193 |
+
return logits, loss
|
| 194 |
+
|
| 195 |
+
def crop_block_size(self, block_size):
|
| 196 |
+
# model surgery to decrease the block size if necessary
|
| 197 |
+
# e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
|
| 198 |
+
# but want to use a smaller block size for some smaller, simpler model
|
| 199 |
+
assert block_size <= self.config.block_size
|
| 200 |
+
self.config.block_size = block_size
|
| 201 |
+
self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
|
| 202 |
+
for block in self.transformer.h:
|
| 203 |
+
if hasattr(block.attn, 'bias'):
|
| 204 |
+
block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
|
| 205 |
+
|
| 206 |
+
@classmethod
def from_pretrained(cls, model_type, override_args=None):
    """Build a GPT and copy the Hugging Face GPT-2 weights for ``model_type`` into it.

    Args:
        model_type: one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
        override_args: None, or a dict whose only permitted key is 'dropout'.

    Returns:
        The GPT model with the pretrained weights copied in.

    Asserts on an unknown ``model_type``, on any override key other than
    'dropout', and on any key-count or shape mismatch between the two state dicts.
    """
    # n_layer, n_head and n_embd are determined from model_type
    presets = {
        'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),   # 124M params
        'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
        'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
        'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
    }
    assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
    if not override_args:
        override_args = {}  # default to empty dict
    # only dropout can be overridden
    assert all(k == 'dropout' for k in override_args)
    from transformers import GPT2LMHeadModel
    print("loading weights from pretrained gpt: %s" % model_type)

    config_args = presets[model_type]
    print("forcing vocab_size=50257, block_size=1024, bias=True")
    # these three values are always the same for the GPT model checkpoints
    config_args.update(vocab_size=50257, block_size=1024, bias=True)
    if 'dropout' in override_args:
        print(f"overriding dropout rate to {override_args['dropout']}")
        config_args['dropout'] = override_args['dropout']

    # create a from-scratch initialized model
    model = GPT(GPTConfig(**config_args))
    sd = model.state_dict()
    # the attention mask is a buffer, not a parameter, so it is left out of the comparison
    sd_keys = [name for name in sd if not name.endswith('.attn.bias')]

    # init a huggingface/transformers model and drop its mask buffers as well
    sd_hf = GPT2LMHeadModel.from_pretrained(model_type).state_dict()
    hf_buffer_suffixes = ('.attn.masked_bias', '.attn.bias')
    sd_keys_hf = [name for name in sd_hf if not name.endswith(hf_buffer_suffixes)]

    # the openai checkpoints use a "Conv1D" module where this model uses a vanilla
    # Linear, so these weights have to be transposed on import
    transposed = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')
    assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
    with torch.no_grad():
        for name in sd_keys_hf:
            source = sd_hf[name]
            if name.endswith(transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert source.shape[::-1] == sd[name].shape
                sd[name].copy_(source.t())
            else:
                # vanilla copy over the other parameters
                assert source.shape == sd[name].shape
                sd[name].copy_(source)

    return model
|
| 262 |
+
|
| 263 |
+
def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
    """Create the AdamW optimizer with weight decay applied only to >=2-D parameters.

    Parameters that require grad are split into two groups: tensors with two
    or more dimensions (matmul weights and embeddings) receive ``weight_decay``;
    tensors with fewer dimensions (biases, layernorm parameters) receive none.
    The fused AdamW implementation is requested when ``torch.optim.AdamW``
    accepts a ``fused`` argument and ``device_type`` is 'cuda'.

    Returns:
        The configured ``torch.optim.AdamW`` instance.
    """
    # only parameters that take part in training are handed to the optimizer
    trainable = [p for _, p in self.named_parameters() if p.requires_grad]

    decay_params, nodecay_params = [], []
    for p in trainable:
        bucket = decay_params if p.dim() >= 2 else nodecay_params
        bucket.append(p)

    optim_groups = [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': nodecay_params, 'weight_decay': 0.0}
    ]
    num_decay_params = sum(p.numel() for p in decay_params)
    num_nodecay_params = sum(p.numel() for p in nodecay_params)
    print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
    print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

    # use the fused AdamW only if this torch build exposes it and we are on CUDA
    fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
    use_fused = fused_available and device_type == 'cuda'
    extra_args = {'fused': True} if use_fused else {}
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
    print(f"using fused AdamW: {use_fused}")

    return optimizer
|
| 288 |
+
|
| 289 |
+
def estimate_mfu(self, fwdbwd_per_iter, dt, flops_promised=312e12):
    """Estimate model flops utilization (MFU) as a fraction of a peak FLOPS figure.

    The per-token cost follows the estimate in PaLM paper Appendix B
    (https://arxiv.org/abs/2204.02311): ``6*N + 12*L*H*Q*T`` with N the
    parameter count, L layers, H heads, Q the per-head width and T the block
    size.

    Args:
        fwdbwd_per_iter: number of forward/backward passes per iteration.
        dt: wall-clock seconds one iteration took (must be non-zero).
        flops_promised: peak FLOPS of the hardware to compare against. Defaults
            to 312e12, the A100 bfloat16 peak, so existing callers get the same
            number as before; pass another value on different hardware.

    Returns:
        Achieved FLOPS per second divided by ``flops_promised``.
    """
    # first estimate the number of flops we do per iteration
    N = self.get_num_params()
    cfg = self.config
    L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
    flops_per_token = 6*N + 12*L*H*Q*T
    flops_per_fwdbwd = flops_per_token * T
    flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
    # express our flops throughput as a ratio of the promised peak
    flops_achieved = flops_per_iter * (1.0/dt)  # per second
    mfu = flops_achieved / flops_promised
    return mfu
|
| 304 |
+
|
| 305 |
+
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    """Extend the conditioning sequence ``idx`` by ``max_new_tokens`` tokens.

    Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and
    complete the sequence max_new_tokens times, feeding the predictions back
    into the model each time. Most likely you'll want to make sure to be in
    model.eval() mode of operation for this.

    Args:
        idx: LongTensor of shape (b, t) with the conditioning token indices.
        max_new_tokens: number of tokens to append.
        temperature: divisor applied to the final-step logits before sampling.
            ``0`` selects the highest-logit token (greedy decoding) instead of
            dividing by zero; ``top_k`` has no effect in that case.
        top_k: if given, only the ``top_k`` highest logits stay eligible.

    Returns:
        LongTensor of shape (b, t + max_new_tokens).
    """
    for _ in range(max_new_tokens):
        # if the sequence context is growing too long we must crop it at block_size
        idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
        # forward the model to get the logits for the index in the sequence
        logits, _ = self(idx_cond)
        # pluck the logits at the final step
        logits = logits[:, -1, :]
        if temperature == 0:
            # dividing by zero would yield inf/nan logits and make multinomial
            # fail; the zero-temperature limit of sampling is the argmax
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)
        else:
            # scale by desired temperature (this also produces a fresh tensor,
            # so the in-place masking below does not touch the model output)
            logits = logits / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
        # append the chosen index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)

    return idx
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Optional: wandb is only imported when wandb_log=True is set in the training config.
|
| 2 |
+
torch>=2.0
|
| 3 |
+
numpy
|
| 4 |
+
tqdm
|
| 5 |
+
wandb
|
| 6 |
+
scikit-learn
|
| 7 |
+
matplotlib
|
roc_analysis_color_threshold_F1e.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
|
| 3 |
+
Notes:
|
| 4 |
+
|
| 5 |
+
The script handles variable numbers of PRS and RRS probability values.
|
| 6 |
+
It combines the probabilities and labels, computes the ROC curve, calculates the AUC (Area Under the Curve), and plots the ROC curve using Matplotlib.
|
| 7 |
+
You can customize the script to read multiple PRS and RRS files by modifying the read_probabilities function and how you handle the input arguments.
|
| 8 |
+
Dependencies:
|
| 9 |
+
|
| 10 |
+
Python 3
|
| 11 |
+
numpy
|
| 12 |
+
matplotlib
|
| 13 |
+
scikit-learn
|
| 14 |
+
|
| 15 |
+
Explanation of the Area Under the Curve (AUC):
|
| 16 |
+
|
| 17 |
+
The Area Under the Curve (AUC) refers to the area under the Receiver Operating Characteristic (ROC) curve. The ROC curve is a graphical representation of a classifier's performance across all classification thresholds. It plots the True Positive Rate (TPR) against the False Positive Rate (FPR) at various threshold settings.
|
| 18 |
+
|
| 19 |
+
True Positive Rate (TPR), also known as Sensitivity or Recall, is the proportion of actual positives that are correctly identified.
|
| 20 |
+
False Positive Rate (FPR) is the proportion of actual negatives that are incorrectly identified as positives.
|
| 21 |
+
The AUC provides a single scalar value that summarizes the performance of the classifier:
|
| 22 |
+
|
| 23 |
+
An AUC of 1.0 indicates a perfect classifier.
|
| 24 |
+
An AUC of 0.5 suggests no discriminative ability (equivalent to random guessing).
|
| 25 |
+
An AUC between 0.5 and 1.0 indicates the degree to which the classifier can distinguish between the positive and negative classes.
|
| 26 |
+
Why is AUC important?
|
| 27 |
+
|
| 28 |
+
More on interpreting the ROC Curve:
|
| 29 |
+
|
| 30 |
+
The ROC curve plots the TPR against the FPR at various threshold levels.
|
| 31 |
+
The closer the curve follows the left-hand border and then the top border of the ROC space, the better the classifier.
|
| 32 |
+
The diagonal line represents the performance of a classifier that makes random guesses.
|
| 33 |
+
Understanding AUC Values:
|
| 34 |
+
|
| 35 |
+
AUC = 0.90-1.00: Excellent
|
| 36 |
+
AUC = 0.80-0.90: Good
|
| 37 |
+
AUC = 0.70-0.80: Fair
|
| 38 |
+
AUC = 0.60-0.70: Poor
|
| 39 |
+
AUC = 0.50-0.60: Fail
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
Threshold-Independent: AUC measures the classifier's ability to rank predictions without being dependent on a specific threshold.
|
| 44 |
+
Performance Metric: It provides a comprehensive measure of performance across all possible classification thresholds.
|
| 45 |
+
In summary, the AUC quantifies the overall ability of the model to discriminate between positive and negative classes. A higher AUC indicates better model performance.
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
F1 = 2 * ( (precision * recall) / (precision + recall) )
|
| 50 |
+
|
| 51 |
+
precision = TP / (TP + FP)
|
| 52 |
+
|
| 53 |
+
recall = TP / (TP + FN)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
You can adjust the decimal precision by changing ".3f" to the desired format in the f-strings that report the best F1 threshold, e.g. f'Best F1 Threshold = {best_thresh:.3f}'
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
"""
|
| 60 |
+
#
|
| 61 |
+
|
| 62 |
+
# pip install numpy matplotlib scikit-learn
|
| 63 |
+
|
| 64 |
+
# python roc_analysis_color_threshold_F1e.py --input_csv probabilities.csv --output_file roc_curve.png
|
| 65 |
+
|
| 66 |
+
#!/usr/bin/env python
|
| 67 |
+
|
| 68 |
+
#!/usr/bin/env python
|
| 69 |
+
import argparse
|
| 70 |
+
import numpy as np
|
| 71 |
+
import matplotlib.pyplot as plt
|
| 72 |
+
from sklearn.metrics import roc_curve, auc, f1_score
|
| 73 |
+
import csv
|
| 74 |
+
|
| 75 |
+
def read_probabilities_from_csv(filename):
    """Read PRS and RRS probability values from a CSV file.

    The first row is treated as a header and discarded. In every later row
    that has at least two columns, column 0 is a PRS probability and column 1
    an RRS probability; blank cells are skipped, so the two columns may hold
    different numbers of values. Rows with fewer than two columns are ignored.

    Args:
        filename: path of the CSV file.

    Returns:
        ``(prs_probs, rrs_probs)``, two lists of floats. Both are empty when
        the file is empty or holds only a header.

    Raises:
        ValueError: if a non-blank cell cannot be parsed as a float.
    """
    prs_probs = []
    rrs_probs = []
    # newline='' lets the csv module do its own newline handling
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header row; the default keeps an empty file from raising StopIteration
        next(reader, None)
        for row in reader:
            # Ensure there are at least two columns
            if len(row) < 2:
                continue
            prs_value = row[0].strip()
            rrs_value = row[1].strip()
            # Append PRS probability if not empty
            if prs_value:
                prs_probs.append(float(prs_value))
            # Append RRS probability if not empty
            if rrs_value:
                rrs_probs.append(float(rrs_value))
    return prs_probs, rrs_probs
|
| 94 |
+
|
| 95 |
+
def main():
    """Compute the ROC curve and best-F1 threshold for a PRS/RRS CSV and plot it.

    Command-line options:
        --input_csv    CSV read by read_probabilities_from_csv; PRS values are
                       labelled 1 and RRS values are labelled 0.
        --output_file  PNG path for the plot (default 'roc_curve.png').

    The ROC curve is drawn in segments coloured by threshold, a subset of
    thresholds is annotated, and AUC / best F1 / best-F1 threshold are shown in
    the legend and printed.

    Exits through ``parser.error`` when either class has no values or when no
    ROC threshold lies inside [0, 1], instead of failing later with an
    unrelated-looking exception.
    """
    parser = argparse.ArgumentParser(description='Compute ROC curve, best F1 score, and annotate thresholds.')
    parser.add_argument('--input_csv', required=True, help='CSV file containing PRS and RRS probability values')
    parser.add_argument('--output_file', default='roc_curve.png', help='Output file name for ROC curve plot')

    args = parser.parse_args()

    # Read probability values from CSV file
    prs_probs, rrs_probs = read_probabilities_from_csv(args.input_csv)
    if not prs_probs or not rrs_probs:
        # a ROC curve needs both classes; fail with a clear message up front
        parser.error(f'{args.input_csv} must contain at least one PRS and one RRS probability '
                     f'(found {len(prs_probs)} PRS, {len(rrs_probs)} RRS)')

    # Assign labels
    prs_labels = [1] * len(prs_probs)
    rrs_labels = [0] * len(rrs_probs)

    # Combine probabilities and labels
    probs = np.array(prs_probs + rrs_probs)
    labels = np.array(prs_labels + rrs_labels)

    # Compute ROC curve and AUC
    fpr, tpr, thresholds = roc_curve(labels, probs)
    roc_auc = auc(fpr, tpr)

    # Exclude infinite thresholds and thresholds outside [0, 1]
    keep = np.isfinite(thresholds) & (thresholds >= 0) & (thresholds <= 1)
    fpr = fpr[keep]
    tpr = tpr[keep]
    thresholds = thresholds[keep]
    if thresholds.size == 0:
        parser.error('no ROC threshold lies inside [0, 1]; the input columns must hold probabilities')

    # Compute best F1 score across thresholds
    best_f1 = -1.0
    best_thresh = None
    for thresh in thresholds:
        predicted_labels = (probs >= thresh).astype(int)
        current_f1 = f1_score(labels, predicted_labels)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_thresh = thresh

    # Set global font
    plt.rcParams['font.family'] = 'Arial'

    # Create figure and colormap
    fig, ax = plt.subplots(figsize=(10, 8))
    norm = plt.Normalize(vmin=thresholds.min(), vmax=thresholds.max())
    cmap = plt.cm.viridis

    # Plot the ROC curve in segments, color-coded by threshold
    for i in range(len(fpr) - 1):
        x = fpr[i:i + 2]
        y = tpr[i:i + 2]
        z = thresholds[i]
        ax.plot(x, y, color=cmap(norm(z)), lw=2.5)

    # Diagonal line
    ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')

    # Add a colorbar for thresholds
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cbar = fig.colorbar(sm, ax=ax)
    cbar.set_label('Threshold', fontsize=16)
    cbar.ax.tick_params(labelsize=14)

    # Annotate a subset of thresholds on the ROC curve
    num_thresholds_to_annotate = 10  # Number of thresholds to annotate
    idxs = np.linspace(0, len(thresholds) - 1, num_thresholds_to_annotate).astype(int)
    for idx in idxs:
        thresh = thresholds[idx]
        ax.annotate(f'{thresh:.2f}', xy=(fpr[idx], tpr[idx]),
                    textcoords='offset points', xytext=(0, 10),
                    ha='center', fontsize=12, color='blue')

    # The best-F1 operating point is reported in the legend only; it is
    # deliberately not marked on the curve.

    # Set axis limits and labels
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=16)
    ax.set_ylabel('True Positive Rate', fontsize=16)
    ax.set_title('Receiver Operating Characteristic (ROC)', fontsize=18)
    ax.tick_params(axis='both', which='major', labelsize=14)

    # Add gridlines
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)

    # Add legend with 3 decimal places
    legend_text = (f'ROC curve (AUC = {roc_auc:.3f}, '
                   f'Best F1 = {best_f1:.3f}, '
                   f'Best F1 Threshold = {best_thresh:.3f})')
    ax.legend([legend_text], loc="lower right", fontsize=12)

    # Adjust layout
    plt.tight_layout()

    # Save and show the figure
    plt.savefig(args.output_file, dpi=300, format='png')
    plt.show()

    print(f"ROC curve saved to {args.output_file}")
    print(f"Best F1 Score: {best_f1:.3f} at threshold {best_thresh:.3f}")
|
| 208 |
+
|
| 209 |
+
if __name__ == '__main__':
|
| 210 |
+
main()
|
sample_fasta3.3_softmax_error_handling3e.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
sample_fasta3.3_softmax_error_handling3e.py — batch inference for ppiGPLM.
|
| 3 |
+
|
| 4 |
+
Loads a ckpt.pt from <model_dir> (default "out") and runs inference on a file of
|
| 5 |
+
prompts (one prompt per line, no quotes), producing:
|
| 6 |
+
|
| 7 |
+
- <output_prefix>_classifications.txt : FASTA-like dump of model output
|
| 8 |
+
- <output_prefix>_probabilities.csv : per-pair softmax probabilities for "1" and "0"
|
| 9 |
+
|
| 10 |
+
Robustness:
|
| 11 |
+
- Block-size detection from checkpoint['model_args']['block_size'] (or
|
| 12 |
+
model.config.n_positions for GPT-2 variants).
|
| 13 |
+
- Input clipping: if a prompt exceeds block_size, the head is clipped
|
| 14 |
+
(start_ids = start_ids[-block_size:]) so the label position stays intact.
|
| 15 |
+
- Unknown-token replacement: out-of-vocab characters are mapped to the '<unk>' token id when the vocabulary defines one, otherwise to token id 0.
|
| 16 |
+
|
| 17 |
+
Usage:
|
| 18 |
+
python sample_fasta3.3_softmax_error_handling3e.py \\
|
| 19 |
+
--input_file my_prompts.txt \\
|
| 20 |
+
--output_dir results \\
|
| 21 |
+
--output_prefix myoutput
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import os
|
| 25 |
+
import sys
|
| 26 |
+
import argparse
|
| 27 |
+
from contextlib import nullcontext
|
| 28 |
+
import pickle
|
| 29 |
+
import torch
|
| 30 |
+
import torch.nn.functional as F
|
| 31 |
+
from model import GPTConfig, GPT
|
| 32 |
+
|
| 33 |
+
# -----------------------------------------------------------------------------
# Parse command-line arguments
# -----------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Sample from a trained model with prompt input.')
parser.add_argument('--input_file', type=str, default='generated_prompts.txt', help='Path to file containing prompts')
parser.add_argument('--output_dir', type=str, default='out-ppi', help='Directory to save outputs')
parser.add_argument('--output_prefix', type=str, default='generated', help='Prefix for output files')
args = parser.parse_args()

# Reset sys.argv for configurator, so the options parsed above are not handed to it
sys.argv = [sys.argv[0]]

prompts_file_path = args.input_file
output_dir = args.output_dir
os.makedirs(output_dir, exist_ok=True)

fasta_output_filename = os.path.join(output_dir, args.output_prefix + '_classifications.txt')
csv_output_filename = os.path.join(output_dir, args.output_prefix + '_probabilities.csv')

# -----------------------------------------------------------------------------
# Sampling parameters and model init overrides
# -----------------------------------------------------------------------------
init_from = 'resume'  # or a GPT-2 variant
model_dir = 'out'
max_new_tokens = 1
temperature = 0.1
top_k = 2
seed = int.from_bytes(os.urandom(4), 'big')
# sys.argv was reset above, so nothing on the command line can change the
# device; fall back to CPU here so the script still runs without CUDA
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
compile = False
# NOTE(review): this executes configurator.py from the working directory with
# full access to this module's globals; only run it from a trusted checkout.
with open('configurator.py') as config_file:
    exec(config_file.read())  # overrides from command line or config file

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'cuda' if 'cuda' in device else 'cpu'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# -----------------------------------------------------------------------------
# Model Initialization
# -----------------------------------------------------------------------------
checkpoint = None  # stays None unless a checkpoint is loaded below
if init_from == 'resume':
    ckpt_path = os.path.join(model_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # strip the '_orig_mod.' prefix from any state-dict key that carries it
    unwanted_prefix = '_orig_mod.'
    for k in list(state_dict.keys()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    block_size = checkpoint['model_args'].get('block_size', 1024)
else:
    model = GPT.from_pretrained(init_from, dict(dropout=0.0))
    # the model config stores its context length as block_size
    block_size = getattr(model.config, 'block_size', 1024)

model.eval()
model.to(device)
if compile:
    model = torch.compile(model)

# -----------------------------------------------------------------------------
# Load vocabulary mapping for character-level tokenization
# -----------------------------------------------------------------------------
if checkpoint is None:
    # the dataset name that locates meta.pkl is only stored in a checkpoint
    raise RuntimeError(
        f'init_from={init_from!r} provides no checkpoint config to locate data/<dataset>/meta.pkl; '
        "this script needs a character-level checkpoint (init_from='resume')"
    )
meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
if not os.path.exists(meta_path):
    raise FileNotFoundError(f'Meta file not found: {meta_path}')
# NOTE(review): pickle.load can execute arbitrary code; only load a meta.pkl you trust.
with open(meta_path, 'rb') as meta_file:
    meta = pickle.load(meta_file)
stoi = meta['stoi']
itos = meta['itos']
# characters missing from the vocabulary map to '<unk>' if defined, else to id 0
encode = lambda s: [stoi.get(ch, stoi.get('<unk>', 0)) for ch in s]
decode = lambda l: ''.join([itos.get(i, '') for i in l])

# -----------------------------------------------------------------------------
# Read prompts
# -----------------------------------------------------------------------------
with open(prompts_file_path, 'r', encoding='utf-8') as f:
    prompts = [line.strip() for line in f if line.strip()]

# -----------------------------------------------------------------------------
# Token IDs for classification tokens
# -----------------------------------------------------------------------------
# Look the label characters up directly: going through encode() would fall
# back to the unknown-token id and report that token's probability instead.
one_id = stoi.get('1')
zero_id = stoi.get('0')

# -----------------------------------------------------------------------------
# FASTA formatting helper
# -----------------------------------------------------------------------------
def format_as_fasta(sequence, sample_number):
    """Return ``sequence`` as a FASTA-like record headed '>Sample_<sample_number>'."""
    return f'>Sample_{sample_number}\n{sequence}\n'

# -----------------------------------------------------------------------------
# Generate outputs and write to files
# -----------------------------------------------------------------------------
with open(fasta_output_filename, 'w', encoding='utf-8') as fasta_file, \
     open(csv_output_filename, 'w', encoding='utf-8') as csv_file:
    csv_file.write('l1,Seq1,l2,Seq2,l3,Probability_of_1,Probability_of_0\n')
    with torch.no_grad(), ctx:
        for k, prompt in enumerate(prompts):
            start_ids = encode(prompt)
            if len(start_ids) > block_size:
                # keep the last block_size tokens so the end of the prompt is preserved
                start_ids = start_ids[-block_size:]
            x = torch.tensor(start_ids, dtype=torch.long, device=device).unsqueeze(0)

            logits = model(x)
            if isinstance(logits, tuple):
                logits = logits[0]
            # next-token distribution at the final position of the prompt
            last_logits = logits[:, -1, :]
            probs = F.softmax(last_logits, dim=-1)

            prob_for_1 = probs[0, one_id].item() if one_id is not None and one_id < probs.shape[-1] else 0.0
            prob_for_0 = probs[0, zero_id].item() if zero_id is not None and zero_id < probs.shape[-1] else 0.0

            y = model.generate(idx=x, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k)
            sample_text = decode(y[0].tolist())

            fasta_file.write(format_as_fasta(sample_text, k) + '\n')
            # NOTE(review): the prompt is written unquoted; the header suggests it is
            # expected to supply the l1,Seq1,l2,Seq2,l3 columns itself - confirm.
            csv_file.write(f'{prompt},{prob_for_1},{prob_for_0}\n')

            print(format_as_fasta(sample_text, k))
            print(f'Probability(next_token=1) = {prob_for_1}')
            print(f'Probability(next_token=0) = {prob_for_0}')
            print('---------------')

print(f'FASTA-like samples saved to {fasta_output_filename}')
print(f'CSV with probabilities saved to {csv_output_filename}')
|
train_.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This training script can be run both on a single gpu in debug mode,
|
| 3 |
+
and also in a larger training run with distributed data parallel (ddp).
|
| 4 |
+
|
| 5 |
+
To run on a single GPU, example:
|
| 6 |
+
$ python train.py --batch_size=32 --compile=False
|
| 7 |
+
|
| 8 |
+
To run with DDP on 4 gpus on 1 node, example:
|
| 9 |
+
$ torchrun --standalone --nproc_per_node=4 train.py
|
| 10 |
+
|
| 11 |
+
To run with DDP on 8 gpus per node across 2 nodes (16 gpus total), example:
|
| 12 |
+
- Run on the first (master) node with example IP 123.456.123.456:
|
| 13 |
+
$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
|
| 14 |
+
- Run on the worker node:
|
| 15 |
+
$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
|
| 16 |
+
(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import time
|
| 21 |
+
import math
|
| 22 |
+
import pickle
|
| 23 |
+
from contextlib import nullcontext
|
| 24 |
+
|
| 25 |
+
import numpy as np
|
| 26 |
+
import torch
|
| 27 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 28 |
+
from torch.distributed import init_process_group, destroy_process_group
|
| 29 |
+
|
| 30 |
+
from model import GPTConfig, GPT
|
| 31 |
+
|
| 32 |
+
# -----------------------------------------------------------------------------
# default config values designed to train a gpt2 (124M) on OpenWebText
# Every public int/float/bool/str global defined in this section is recorded in
# `config_keys` below and ends up in the `config` dict used for logging.
# I/O
out_dir = 'out'
eval_interval = 2000
log_interval = 1
eval_iters = 200
eval_only = False # if True, script exits right after the first eval
always_save_checkpoint = True # if True, always save a checkpoint after each eval
init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
# wandb logging
wandb_log = False # disabled by default
wandb_project = 'owt'
wandb_run_name = 'gpt2' # 'run' + str(time.time())
# data
dataset = 'openwebtext'
gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
block_size = 1024
# model
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
bias = False # do we use bias inside LayerNorm and Linear layers?
# adamw optimizer
learning_rate = 6e-4 # max learning rate
max_iters = 600000 # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr = True # whether to decay the learning rate
warmup_iters = 2000 # how many steps to warm up for
lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla
min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
# DDP settings
backend = 'nccl' # 'nccl', 'gloo', etc.
# system
device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
compile = True # use PyTorch 2.0 to compile the model to be faster
# -----------------------------------------------------------------------------
# Snapshot the names of the settings *before* running the configurator so that
# only the defaults declared above are treated as configuration.
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
# NOTE(review): exec() runs configurator.py with this module's globals, so that
# file must be trusted. The file is read inside `with` so the handle is closed
# deterministically instead of being left to the garbage collector.
with open('configurator.py') as _configurator_file:
    exec(_configurator_file.read()) # overrides from command line or config file
config = {k: globals()[k] for k in config_keys} # will be useful for logging
|
| 79 |
+
# -----------------------------------------------------------------------------
|
| 80 |
+
|
| 81 |
+
# various inits, derived attributes, I/O setup
ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank # each process gets a different seed
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally.
    # This is validation of user-supplied config, so raise explicitly: an `assert`
    # would be stripped under `python -O` and the floor division below would then
    # silently change the effective batch size.
    if gradient_accumulation_steps % ddp_world_size != 0:
        raise ValueError(
            f"gradient_accumulation_steps ({gradient_accumulation_steps}) must be "
            f"divisible by the DDP world size ({ddp_world_size})"
        )
    gradient_accumulation_steps //= ddp_world_size
else:
    # if not ddp, we are running on a single gpu, and one process
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

if master_process:
    os.makedirs(out_dir, exist_ok=True)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
# note: float16 data type will automatically use a GradScaler
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# on cpu no autocast is used at all; otherwise forward passes run under autocast in `ptdtype`
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# poor man's data loader: both splits are memory-mapped read-only as uint16 token ids
data_dir = os.path.join('data', dataset)
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
|
| 118 |
+
def get_batch(split):
    """Sample a random batch of (input, target) token windows from one split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A tuple `(x, y)` of int64 tensors on `device`, each holding
        `batch_size` windows of `block_size` tokens. `y` is the same window
        as `x` shifted one token to the right (next-token targets).

    Raises:
        ValueError: if the selected split is too short to cut even one window
            of `block_size + 1` tokens.
    """
    data = train_data if split == 'train' else val_data
    # valid start offsets are [0, max_start): the target window reads one token
    # past the input window, so the last usable start is len(data) - block_size - 1
    max_start = len(data) - block_size
    if max_start <= 0:
        raise ValueError(
            f"split {split!r} has {len(data)} tokens, but at least "
            f"{block_size + 1} are needed for block_size={block_size}"
        )
    ix = torch.randint(max_start, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y
|
| 129 |
+
|
| 130 |
+
# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
iter_num = 0
best_val_loss = 1e9

# attempt to derive vocab_size from the dataset
meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
    # NOTE(security): pickle.load can execute arbitrary code; only point `dataset`
    # at a directory whose meta.pkl you trust.
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")

# model init
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
                  bias=bias, vocab_size=None, dropout=dropout) # start with model_args from command line
if init_from == 'scratch':
    # init a new model from scratch
    print("Initializing a new model from scratch")
    # determine the vocab size we'll use for from-scratch training
    if meta_vocab_size is None:
        print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
    model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif init_from == 'resume':
    print(f"Resuming training from {out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    # force these config attributes to be equal otherwise we can't even resume training
    # the rest of the attributes (e.g. dropout) can stay as desired from command line
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    # create the model
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # fix the keys of the state dictionary :(
    # honestly no idea how checkpoints sometimes get this prefix, have to debug more
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    # initialize from OpenAI GPT-2 weights
    override_args = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override_args)
    # read off the created config params, so we can store them into checkpoint correctly
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)
else:
    # fail here with a clear message instead of a NameError on `model` further down
    raise ValueError(f"unknown init_from {init_from!r}: expected 'scratch', 'resume' or 'gpt2*'")
# crop down the model block size if desired, using model surgery
if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size # so that the checkpoint will have the right value
model.to(device)
|
| 191 |
+
|
| 192 |
+
# gradient scaler: only active for float16 training, otherwise every scaler call is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

# optimizer, built by the model itself; when resuming, its state is restored from the checkpoint
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None # drop the reference so the loaded checkpoint can be freed

# optionally compile the model (requires PyTorch 2.0); the eager module stays
# reachable as `unoptimized_model`
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model)

# in a DDP run, wrap the model in the DDP container on this process's local GPU
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])
|
| 210 |
+
|
| 211 |
+
# helps estimate an arbitrarily accurate loss over either split using many batches
|
| 212 |
+
@torch.no_grad()
def estimate_loss():
    """Return the mean loss over `eval_iters` random batches for each split.

    The model is switched to eval mode for the measurement and back to train
    mode before returning.

    Returns:
        dict with keys 'train' and 'val', each a scalar tensor holding the
        mean of the per-batch losses for that split.
    """
    mean_loss_by_split = {}
    model.eval()
    for split_name in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split_name)
            with ctx:
                _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        mean_loss_by_split[split_name] = batch_losses.mean()
    model.train()
    return mean_loss_by_split
|
| 226 |
+
|
| 227 |
+
# learning rate decay scheduler (cosine with warmup)
|
| 228 |
+
def get_lr(it):
    """Return the learning rate for iteration `it` (linear warmup, then cosine decay).

    Reads the module-level settings `learning_rate`, `min_lr`, `warmup_iters`
    and `lr_decay_iters`.

    Args:
        it: iteration number.

    Returns:
        The learning rate to use at iteration `it`.
    """
    # 1) linear warmup for warmup_iters steps (note: this yields 0.0 at it == 0)
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) once it reaches lr_decay_iters, return min learning rate.
    #    `>=` instead of `>`: the cosine branch already evaluates to exactly
    #    min_lr at it == lr_decay_iters, and stopping here avoids a division by
    #    zero when lr_decay_iters == warmup_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)
|
| 240 |
+
|
| 241 |
+
# experiment tracking: wandb is imported lazily and only on the master process
if wandb_log and master_process:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name, config=config)

# training loop
X, Y = get_batch('train') # prime the loop with the first batch
t0 = time.time()
local_iter_num = 0 # iterations done by this process (iter_num may start > 0 on resume)
raw_model = model.module if ddp else model # the module underneath the DDP wrapper, if any
running_mfu = -1.0 # -1.0 means "no estimate yet"
while True:

    # pick this iteration's learning rate and push it into every param group
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # every eval_interval iterations the master process evaluates and checkpoints
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": lr,
                "mfu": running_mfu*100, # convert to percentage
            })
        if losses['val'] < best_val_loss or always_save_checkpoint:
            # best_val_loss follows the checkpoint being written, so with
            # always_save_checkpoint it is the latest val loss, not the minimum
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
    if iter_num == 0 and eval_only:
        break

    # forward/backward over gradient_accumulation_steps micro-batches; the
    # GradScaler is a pass-through unless training in float16
    for micro_step in range(gradient_accumulation_steps):
        if ddp:
            # gradients only need to be synchronised on the last micro step;
            # this flag is what the model.no_sync() context manager toggles
            model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps # average over the accumulated micro-batches
        # fetch the next batch right away so the copy overlaps with GPU work
        X, Y = get_batch('train')
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # gradient clipping operates on unscaled gradients
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # optimizer step, then let the scaler adjust its scale factor
    scaler.step(optimizer)
    scaler.update()
    # release gradient memory as early as possible
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # loss.item() is a CPU-GPU sync point; multiplying back undoes the
        # per-micro-step division, so this reports the last micro-batch's loss
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5: # skip the first iterations while timings settle
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    # stop once the iteration budget is exhausted
    if iter_num > max_iters:
        break

if ddp:
    destroy_process_group()
|