Spaces:
Sleeping
Sleeping
cao commited on
Commit ·
78f28d5
1
Parent(s): d5362a7
Add model and predictor files
Browse files- src/HLA_dict.npy +3 -0
- src/aa_properties_aaindex.py +514 -0
- src/library/.DS_Store +0 -0
- src/library/hla_library/A_prot.fasta +3 -0
- src/library/hla_library/B_prot.fasta +3 -0
- src/library/hla_library/C_prot.fasta +3 -0
- src/library/hla_library/E_prot.fasta +3 -0
- src/library/hla_prot.fasta +3 -0
- src/library/trajs_aa.tsv +65 -0
- src/library/trajs_nt.tsv +61 -0
- src/library/travs_aa.tsv +110 -0
- src/library/travs_nt.tsv +110 -0
- src/library/trbjs_aa.tsv +15 -0
- src/library/trbjs_nt.tsv +15 -0
- src/library/trbvs_aa.tsv +115 -0
- src/library/trbvs_nt.tsv +106 -0
- src/main.py +1423 -0
- src/model.pt +3 -0
- src/model.py +1995 -0
- src/phla_cache/hla_coord_dict.pt +3 -0
- src/phla_cache/hla_feat_dict.pt +3 -0
- src/physicochemical.py +293 -0
- src/predictor.py +28 -0
- src/streamlit_app.py +58 -39
- src/streamlit_app0.py +40 -0
src/HLA_dict.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2cee1e02e5548817e06e60b258f57dd27a3707dff357656cd956cab81adf2e6
|
| 3 |
+
size 3287055
|
src/aa_properties_aaindex.py
ADDED
|
@@ -0,0 +1,514 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Amino Acid Properties from AAindex Database
|
| 3 |
+
Auto-generated by AAindexDownloader
|
| 4 |
+
|
| 5 |
+
Total features: 20
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
# Raw values from AAindex
|
| 11 |
+
AA_PROPERTIES_AAINDEX = {
|
| 12 |
+
'A': {
|
| 13 |
+
'BIGC670101': 52.600000,
|
| 14 |
+
'CHAM820101': 0.046000,
|
| 15 |
+
'CHOP780201': 1.420000,
|
| 16 |
+
'CHOP780202': 0.830000,
|
| 17 |
+
'CHOP780203': 0.740000,
|
| 18 |
+
'EISD860101': 0.670000,
|
| 19 |
+
'FASG760101': 89.090000,
|
| 20 |
+
'FAUJ830101': 0.310000,
|
| 21 |
+
'GRAR740102': 8.100000,
|
| 22 |
+
'GRAR740103': 31.000000,
|
| 23 |
+
'GUYH850101': 0.100000,
|
| 24 |
+
'HOPT810101': -0.500000,
|
| 25 |
+
'JANJ780101': 27.800000,
|
| 26 |
+
'KARP850101': 1.041000,
|
| 27 |
+
'KYTJ820101': 1.800000,
|
| 28 |
+
'ROSM880101': -0.670000,
|
| 29 |
+
'VINM940101': 0.984000,
|
| 30 |
+
'WERD780101': 0.520000,
|
| 31 |
+
'ZIMJ680101': 0.830000,
|
| 32 |
+
'ZIMJ680104': 6.000000,
|
| 33 |
+
},
|
| 34 |
+
'R': {
|
| 35 |
+
'BIGC670101': 109.100000,
|
| 36 |
+
'CHAM820101': 0.291000,
|
| 37 |
+
'CHOP780201': 0.980000,
|
| 38 |
+
'CHOP780202': 0.930000,
|
| 39 |
+
'CHOP780203': 1.010000,
|
| 40 |
+
'EISD860101': -2.100000,
|
| 41 |
+
'FASG760101': 174.200000,
|
| 42 |
+
'FAUJ830101': -1.010000,
|
| 43 |
+
'GRAR740102': 10.500000,
|
| 44 |
+
'GRAR740103': 124.000000,
|
| 45 |
+
'GUYH850101': 1.910000,
|
| 46 |
+
'HOPT810101': 3.000000,
|
| 47 |
+
'JANJ780101': 94.700000,
|
| 48 |
+
'KARP850101': 1.038000,
|
| 49 |
+
'KYTJ820101': -4.500000,
|
| 50 |
+
'ROSM880101': 12.100000,
|
| 51 |
+
'VINM940101': 1.008000,
|
| 52 |
+
'WERD780101': 0.490000,
|
| 53 |
+
'ZIMJ680101': 0.830000,
|
| 54 |
+
'ZIMJ680104': 10.760000,
|
| 55 |
+
},
|
| 56 |
+
'N': {
|
| 57 |
+
'BIGC670101': 75.700000,
|
| 58 |
+
'CHAM820101': 0.134000,
|
| 59 |
+
'CHOP780201': 0.670000,
|
| 60 |
+
'CHOP780202': 0.890000,
|
| 61 |
+
'CHOP780203': 1.460000,
|
| 62 |
+
'EISD860101': -0.600000,
|
| 63 |
+
'FASG760101': 132.120000,
|
| 64 |
+
'FAUJ830101': -0.600000,
|
| 65 |
+
'GRAR740102': 11.600000,
|
| 66 |
+
'GRAR740103': 56.000000,
|
| 67 |
+
'GUYH850101': 0.480000,
|
| 68 |
+
'HOPT810101': 0.200000,
|
| 69 |
+
'JANJ780101': 60.100000,
|
| 70 |
+
'KARP850101': 1.117000,
|
| 71 |
+
'KYTJ820101': -3.500000,
|
| 72 |
+
'ROSM880101': 7.230000,
|
| 73 |
+
'VINM940101': 1.048000,
|
| 74 |
+
'WERD780101': 0.420000,
|
| 75 |
+
'ZIMJ680101': 0.090000,
|
| 76 |
+
'ZIMJ680104': 5.410000,
|
| 77 |
+
},
|
| 78 |
+
'D': {
|
| 79 |
+
'BIGC670101': 68.400000,
|
| 80 |
+
'CHAM820101': 0.105000,
|
| 81 |
+
'CHOP780201': 1.010000,
|
| 82 |
+
'CHOP780202': 0.540000,
|
| 83 |
+
'CHOP780203': 1.520000,
|
| 84 |
+
'EISD860101': -1.200000,
|
| 85 |
+
'FASG760101': 133.100000,
|
| 86 |
+
'FAUJ830101': -0.770000,
|
| 87 |
+
'GRAR740102': 13.000000,
|
| 88 |
+
'GRAR740103': 54.000000,
|
| 89 |
+
'GUYH850101': 0.780000,
|
| 90 |
+
'HOPT810101': 3.000000,
|
| 91 |
+
'JANJ780101': 60.600000,
|
| 92 |
+
'KARP850101': 1.033000,
|
| 93 |
+
'KYTJ820101': -3.500000,
|
| 94 |
+
'ROSM880101': 8.720000,
|
| 95 |
+
'VINM940101': 1.068000,
|
| 96 |
+
'WERD780101': 0.370000,
|
| 97 |
+
'ZIMJ680101': 0.640000,
|
| 98 |
+
'ZIMJ680104': 2.770000,
|
| 99 |
+
},
|
| 100 |
+
'C': {
|
| 101 |
+
'BIGC670101': 68.300000,
|
| 102 |
+
'CHAM820101': 0.128000,
|
| 103 |
+
'CHOP780201': 0.700000,
|
| 104 |
+
'CHOP780202': 1.190000,
|
| 105 |
+
'CHOP780203': 0.960000,
|
| 106 |
+
'EISD860101': 0.380000,
|
| 107 |
+
'FASG760101': 121.150000,
|
| 108 |
+
'FAUJ830101': 1.540000,
|
| 109 |
+
'GRAR740102': 5.500000,
|
| 110 |
+
'GRAR740103': 55.000000,
|
| 111 |
+
'GUYH850101': -1.420000,
|
| 112 |
+
'HOPT810101': -1.000000,
|
| 113 |
+
'JANJ780101': 15.500000,
|
| 114 |
+
'KARP850101': 0.960000,
|
| 115 |
+
'KYTJ820101': 2.500000,
|
| 116 |
+
'ROSM880101': -0.340000,
|
| 117 |
+
'VINM940101': 0.906000,
|
| 118 |
+
'WERD780101': 0.830000,
|
| 119 |
+
'ZIMJ680101': 1.480000,
|
| 120 |
+
'ZIMJ680104': 5.050000,
|
| 121 |
+
},
|
| 122 |
+
'Q': {
|
| 123 |
+
'BIGC670101': 89.700000,
|
| 124 |
+
'CHAM820101': 0.180000,
|
| 125 |
+
'CHOP780201': 1.110000,
|
| 126 |
+
'CHOP780202': 1.100000,
|
| 127 |
+
'CHOP780203': 0.960000,
|
| 128 |
+
'EISD860101': -0.220000,
|
| 129 |
+
'FASG760101': 146.150000,
|
| 130 |
+
'FAUJ830101': -0.220000,
|
| 131 |
+
'GRAR740102': 10.500000,
|
| 132 |
+
'GRAR740103': 85.000000,
|
| 133 |
+
'GUYH850101': 0.950000,
|
| 134 |
+
'HOPT810101': 0.200000,
|
| 135 |
+
'JANJ780101': 68.700000,
|
| 136 |
+
'KARP850101': 1.165000,
|
| 137 |
+
'KYTJ820101': -3.500000,
|
| 138 |
+
'ROSM880101': 6.390000,
|
| 139 |
+
'VINM940101': 1.037000,
|
| 140 |
+
'WERD780101': 0.350000,
|
| 141 |
+
'ZIMJ680101': 0.000000,
|
| 142 |
+
'ZIMJ680104': 5.650000,
|
| 143 |
+
},
|
| 144 |
+
'E': {
|
| 145 |
+
'BIGC670101': 84.700000,
|
| 146 |
+
'CHAM820101': 0.151000,
|
| 147 |
+
'CHOP780201': 1.510000,
|
| 148 |
+
'CHOP780202': 0.370000,
|
| 149 |
+
'CHOP780203': 0.950000,
|
| 150 |
+
'EISD860101': -0.760000,
|
| 151 |
+
'FASG760101': 147.130000,
|
| 152 |
+
'FAUJ830101': -0.640000,
|
| 153 |
+
'GRAR740102': 12.300000,
|
| 154 |
+
'GRAR740103': 83.000000,
|
| 155 |
+
'GUYH850101': 0.830000,
|
| 156 |
+
'HOPT810101': 3.000000,
|
| 157 |
+
'JANJ780101': 68.200000,
|
| 158 |
+
'KARP850101': 1.094000,
|
| 159 |
+
'KYTJ820101': -3.500000,
|
| 160 |
+
'ROSM880101': 7.350000,
|
| 161 |
+
'VINM940101': 1.094000,
|
| 162 |
+
'WERD780101': 0.380000,
|
| 163 |
+
'ZIMJ680101': 0.650000,
|
| 164 |
+
'ZIMJ680104': 3.220000,
|
| 165 |
+
},
|
| 166 |
+
'G': {
|
| 167 |
+
'BIGC670101': 36.300000,
|
| 168 |
+
'CHAM820101': 0.000000,
|
| 169 |
+
'CHOP780201': 0.570000,
|
| 170 |
+
'CHOP780202': 0.750000,
|
| 171 |
+
'CHOP780203': 1.560000,
|
| 172 |
+
'EISD860101': 0.000000,
|
| 173 |
+
'FASG760101': 75.070000,
|
| 174 |
+
'FAUJ830101': 0.000000,
|
| 175 |
+
'GRAR740102': 9.000000,
|
| 176 |
+
'GRAR740103': 3.000000,
|
| 177 |
+
'GUYH850101': 0.330000,
|
| 178 |
+
'HOPT810101': 0.000000,
|
| 179 |
+
'JANJ780101': 24.500000,
|
| 180 |
+
'KARP850101': 1.142000,
|
| 181 |
+
'KYTJ820101': -0.400000,
|
| 182 |
+
'ROSM880101': 0.000000,
|
| 183 |
+
'VINM940101': 1.031000,
|
| 184 |
+
'WERD780101': 0.410000,
|
| 185 |
+
'ZIMJ680101': 0.100000,
|
| 186 |
+
'ZIMJ680104': 5.970000,
|
| 187 |
+
},
|
| 188 |
+
'H': {
|
| 189 |
+
'BIGC670101': 91.900000,
|
| 190 |
+
'CHAM820101': 0.230000,
|
| 191 |
+
'CHOP780201': 1.000000,
|
| 192 |
+
'CHOP780202': 0.870000,
|
| 193 |
+
'CHOP780203': 0.950000,
|
| 194 |
+
'EISD860101': 0.640000,
|
| 195 |
+
'FASG760101': 155.160000,
|
| 196 |
+
'FAUJ830101': 0.130000,
|
| 197 |
+
'GRAR740102': 10.400000,
|
| 198 |
+
'GRAR740103': 96.000000,
|
| 199 |
+
'GUYH850101': -0.500000,
|
| 200 |
+
'HOPT810101': -0.500000,
|
| 201 |
+
'JANJ780101': 50.700000,
|
| 202 |
+
'KARP850101': 0.982000,
|
| 203 |
+
'KYTJ820101': -3.200000,
|
| 204 |
+
'ROSM880101': 3.820000,
|
| 205 |
+
'VINM940101': 0.950000,
|
| 206 |
+
'WERD780101': 0.700000,
|
| 207 |
+
'ZIMJ680101': 1.100000,
|
| 208 |
+
'ZIMJ680104': 7.590000,
|
| 209 |
+
},
|
| 210 |
+
'I': {
|
| 211 |
+
'BIGC670101': 102.000000,
|
| 212 |
+
'CHAM820101': 0.186000,
|
| 213 |
+
'CHOP780201': 1.080000,
|
| 214 |
+
'CHOP780202': 1.600000,
|
| 215 |
+
'CHOP780203': 0.470000,
|
| 216 |
+
'EISD860101': 1.900000,
|
| 217 |
+
'FASG760101': 131.170000,
|
| 218 |
+
'FAUJ830101': 1.800000,
|
| 219 |
+
'GRAR740102': 5.200000,
|
| 220 |
+
'GRAR740103': 111.000000,
|
| 221 |
+
'GUYH850101': -1.130000,
|
| 222 |
+
'HOPT810101': -1.800000,
|
| 223 |
+
'JANJ780101': 22.800000,
|
| 224 |
+
'KARP850101': 1.002000,
|
| 225 |
+
'KYTJ820101': 4.500000,
|
| 226 |
+
'ROSM880101': -3.020000,
|
| 227 |
+
'VINM940101': 0.927000,
|
| 228 |
+
'WERD780101': 0.790000,
|
| 229 |
+
'ZIMJ680101': 3.070000,
|
| 230 |
+
'ZIMJ680104': 6.020000,
|
| 231 |
+
},
|
| 232 |
+
'L': {
|
| 233 |
+
'BIGC670101': 102.000000,
|
| 234 |
+
'CHAM820101': 0.186000,
|
| 235 |
+
'CHOP780201': 1.210000,
|
| 236 |
+
'CHOP780202': 1.300000,
|
| 237 |
+
'CHOP780203': 0.500000,
|
| 238 |
+
'EISD860101': 1.900000,
|
| 239 |
+
'FASG760101': 131.170000,
|
| 240 |
+
'FAUJ830101': 1.700000,
|
| 241 |
+
'GRAR740102': 4.900000,
|
| 242 |
+
'GRAR740103': 111.000000,
|
| 243 |
+
'GUYH850101': -1.180000,
|
| 244 |
+
'HOPT810101': -1.800000,
|
| 245 |
+
'JANJ780101': 27.600000,
|
| 246 |
+
'KARP850101': 0.967000,
|
| 247 |
+
'KYTJ820101': 3.800000,
|
| 248 |
+
'ROSM880101': -3.020000,
|
| 249 |
+
'VINM940101': 0.935000,
|
| 250 |
+
'WERD780101': 0.770000,
|
| 251 |
+
'ZIMJ680101': 2.520000,
|
| 252 |
+
'ZIMJ680104': 5.980000,
|
| 253 |
+
},
|
| 254 |
+
'K': {
|
| 255 |
+
'BIGC670101': 105.100000,
|
| 256 |
+
'CHAM820101': 0.219000,
|
| 257 |
+
'CHOP780201': 1.160000,
|
| 258 |
+
'CHOP780202': 0.740000,
|
| 259 |
+
'CHOP780203': 1.190000,
|
| 260 |
+
'EISD860101': -0.570000,
|
| 261 |
+
'FASG760101': 146.190000,
|
| 262 |
+
'FAUJ830101': -0.990000,
|
| 263 |
+
'GRAR740102': 11.300000,
|
| 264 |
+
'GRAR740103': 119.000000,
|
| 265 |
+
'GUYH850101': 1.400000,
|
| 266 |
+
'HOPT810101': 3.000000,
|
| 267 |
+
'JANJ780101': 103.000000,
|
| 268 |
+
'KARP850101': 1.093000,
|
| 269 |
+
'KYTJ820101': -3.900000,
|
| 270 |
+
'ROSM880101': 6.130000,
|
| 271 |
+
'VINM940101': 1.102000,
|
| 272 |
+
'WERD780101': 0.310000,
|
| 273 |
+
'ZIMJ680101': 1.600000,
|
| 274 |
+
'ZIMJ680104': 9.740000,
|
| 275 |
+
},
|
| 276 |
+
'M': {
|
| 277 |
+
'BIGC670101': 97.700000,
|
| 278 |
+
'CHAM820101': 0.221000,
|
| 279 |
+
'CHOP780201': 1.450000,
|
| 280 |
+
'CHOP780202': 1.050000,
|
| 281 |
+
'CHOP780203': 0.600000,
|
| 282 |
+
'EISD860101': 2.400000,
|
| 283 |
+
'FASG760101': 149.210000,
|
| 284 |
+
'FAUJ830101': 1.230000,
|
| 285 |
+
'GRAR740102': 5.700000,
|
| 286 |
+
'GRAR740103': 105.000000,
|
| 287 |
+
'GUYH850101': -1.590000,
|
| 288 |
+
'HOPT810101': -1.300000,
|
| 289 |
+
'JANJ780101': 33.500000,
|
| 290 |
+
'KARP850101': 0.947000,
|
| 291 |
+
'KYTJ820101': 1.900000,
|
| 292 |
+
'ROSM880101': -1.300000,
|
| 293 |
+
'VINM940101': 0.952000,
|
| 294 |
+
'WERD780101': 0.760000,
|
| 295 |
+
'ZIMJ680101': 1.400000,
|
| 296 |
+
'ZIMJ680104': 5.740000,
|
| 297 |
+
},
|
| 298 |
+
'F': {
|
| 299 |
+
'BIGC670101': 113.900000,
|
| 300 |
+
'CHAM820101': 0.290000,
|
| 301 |
+
'CHOP780201': 1.130000,
|
| 302 |
+
'CHOP780202': 1.380000,
|
| 303 |
+
'CHOP780203': 0.660000,
|
| 304 |
+
'EISD860101': 2.300000,
|
| 305 |
+
'FASG760101': 165.190000,
|
| 306 |
+
'FAUJ830101': 1.790000,
|
| 307 |
+
'GRAR740102': 5.200000,
|
| 308 |
+
'GRAR740103': 132.000000,
|
| 309 |
+
'GUYH850101': -2.120000,
|
| 310 |
+
'HOPT810101': -2.500000,
|
| 311 |
+
'JANJ780101': 25.500000,
|
| 312 |
+
'KARP850101': 0.930000,
|
| 313 |
+
'KYTJ820101': 2.800000,
|
| 314 |
+
'ROSM880101': -3.240000,
|
| 315 |
+
'VINM940101': 0.915000,
|
| 316 |
+
'WERD780101': 0.870000,
|
| 317 |
+
'ZIMJ680101': 2.750000,
|
| 318 |
+
'ZIMJ680104': 5.480000,
|
| 319 |
+
},
|
| 320 |
+
'P': {
|
| 321 |
+
'BIGC670101': 73.600000,
|
| 322 |
+
'CHAM820101': 0.131000,
|
| 323 |
+
'CHOP780201': 0.570000,
|
| 324 |
+
'CHOP780202': 0.550000,
|
| 325 |
+
'CHOP780203': 1.560000,
|
| 326 |
+
'EISD860101': 1.200000,
|
| 327 |
+
'FASG760101': 115.130000,
|
| 328 |
+
'FAUJ830101': 0.720000,
|
| 329 |
+
'GRAR740102': 8.000000,
|
| 330 |
+
'GRAR740103': 32.500000,
|
| 331 |
+
'GUYH850101': 0.730000,
|
| 332 |
+
'HOPT810101': 0.000000,
|
| 333 |
+
'JANJ780101': 51.500000,
|
| 334 |
+
'KARP850101': 1.055000,
|
| 335 |
+
'KYTJ820101': -1.600000,
|
| 336 |
+
'ROSM880101': -1.750000,
|
| 337 |
+
'VINM940101': 1.049000,
|
| 338 |
+
'WERD780101': 0.350000,
|
| 339 |
+
'ZIMJ680101': 2.700000,
|
| 340 |
+
'ZIMJ680104': 6.300000,
|
| 341 |
+
},
|
| 342 |
+
'S': {
|
| 343 |
+
'BIGC670101': 54.900000,
|
| 344 |
+
'CHAM820101': 0.062000,
|
| 345 |
+
'CHOP780201': 0.770000,
|
| 346 |
+
'CHOP780202': 0.750000,
|
| 347 |
+
'CHOP780203': 1.430000,
|
| 348 |
+
'EISD860101': 0.010000,
|
| 349 |
+
'FASG760101': 105.090000,
|
| 350 |
+
'FAUJ830101': -0.040000,
|
| 351 |
+
'GRAR740102': 9.200000,
|
| 352 |
+
'GRAR740103': 32.000000,
|
| 353 |
+
'GUYH850101': 0.520000,
|
| 354 |
+
'HOPT810101': 0.300000,
|
| 355 |
+
'JANJ780101': 42.000000,
|
| 356 |
+
'KARP850101': 1.169000,
|
| 357 |
+
'KYTJ820101': -0.800000,
|
| 358 |
+
'ROSM880101': 4.350000,
|
| 359 |
+
'VINM940101': 1.046000,
|
| 360 |
+
'WERD780101': 0.490000,
|
| 361 |
+
'ZIMJ680101': 0.140000,
|
| 362 |
+
'ZIMJ680104': 5.680000,
|
| 363 |
+
},
|
| 364 |
+
'T': {
|
| 365 |
+
'BIGC670101': 71.200000,
|
| 366 |
+
'CHAM820101': 0.108000,
|
| 367 |
+
'CHOP780201': 0.830000,
|
| 368 |
+
'CHOP780202': 1.190000,
|
| 369 |
+
'CHOP780203': 0.980000,
|
| 370 |
+
'EISD860101': 0.520000,
|
| 371 |
+
'FASG760101': 119.120000,
|
| 372 |
+
'FAUJ830101': 0.260000,
|
| 373 |
+
'GRAR740102': 8.600000,
|
| 374 |
+
'GRAR740103': 61.000000,
|
| 375 |
+
'GUYH850101': 0.070000,
|
| 376 |
+
'HOPT810101': -0.400000,
|
| 377 |
+
'JANJ780101': 45.000000,
|
| 378 |
+
'KARP850101': 1.073000,
|
| 379 |
+
'KYTJ820101': -0.700000,
|
| 380 |
+
'ROSM880101': 3.860000,
|
| 381 |
+
'VINM940101': 0.997000,
|
| 382 |
+
'WERD780101': 0.380000,
|
| 383 |
+
'ZIMJ680101': 0.540000,
|
| 384 |
+
'ZIMJ680104': 5.660000,
|
| 385 |
+
},
|
| 386 |
+
'W': {
|
| 387 |
+
'BIGC670101': 135.400000,
|
| 388 |
+
'CHAM820101': 0.409000,
|
| 389 |
+
'CHOP780201': 1.080000,
|
| 390 |
+
'CHOP780202': 1.370000,
|
| 391 |
+
'CHOP780203': 0.600000,
|
| 392 |
+
'EISD860101': 2.600000,
|
| 393 |
+
'FASG760101': 204.240000,
|
| 394 |
+
'FAUJ830101': 2.250000,
|
| 395 |
+
'GRAR740102': 5.400000,
|
| 396 |
+
'GRAR740103': 170.000000,
|
| 397 |
+
'GUYH850101': -0.510000,
|
| 398 |
+
'HOPT810101': -3.400000,
|
| 399 |
+
'JANJ780101': 34.700000,
|
| 400 |
+
'KARP850101': 0.925000,
|
| 401 |
+
'KYTJ820101': -0.900000,
|
| 402 |
+
'ROSM880101': -2.860000,
|
| 403 |
+
'VINM940101': 0.904000,
|
| 404 |
+
'WERD780101': 0.860000,
|
| 405 |
+
'ZIMJ680101': 0.310000,
|
| 406 |
+
'ZIMJ680104': 5.890000,
|
| 407 |
+
},
|
| 408 |
+
'Y': {
|
| 409 |
+
'BIGC670101': 116.200000,
|
| 410 |
+
'CHAM820101': 0.298000,
|
| 411 |
+
'CHOP780201': 0.690000,
|
| 412 |
+
'CHOP780202': 1.470000,
|
| 413 |
+
'CHOP780203': 1.140000,
|
| 414 |
+
'EISD860101': 1.600000,
|
| 415 |
+
'FASG760101': 181.190000,
|
| 416 |
+
'FAUJ830101': 0.960000,
|
| 417 |
+
'GRAR740102': 6.200000,
|
| 418 |
+
'GRAR740103': 136.000000,
|
| 419 |
+
'GUYH850101': -0.210000,
|
| 420 |
+
'HOPT810101': -2.300000,
|
| 421 |
+
'JANJ780101': 55.200000,
|
| 422 |
+
'KARP850101': 0.961000,
|
| 423 |
+
'KYTJ820101': -1.300000,
|
| 424 |
+
'ROSM880101': 0.980000,
|
| 425 |
+
'VINM940101': 0.929000,
|
| 426 |
+
'WERD780101': 0.640000,
|
| 427 |
+
'ZIMJ680101': 2.970000,
|
| 428 |
+
'ZIMJ680104': 5.660000,
|
| 429 |
+
},
|
| 430 |
+
'V': {
|
| 431 |
+
'BIGC670101': 85.100000,
|
| 432 |
+
'CHAM820101': 0.140000,
|
| 433 |
+
'CHOP780201': 1.060000,
|
| 434 |
+
'CHOP780202': 1.700000,
|
| 435 |
+
'CHOP780203': 0.590000,
|
| 436 |
+
'EISD860101': 1.500000,
|
| 437 |
+
'FASG760101': 117.150000,
|
| 438 |
+
'FAUJ830101': 1.220000,
|
| 439 |
+
'GRAR740102': 5.900000,
|
| 440 |
+
'GRAR740103': 84.000000,
|
| 441 |
+
'GUYH850101': -1.270000,
|
| 442 |
+
'HOPT810101': -1.500000,
|
| 443 |
+
'JANJ780101': 23.700000,
|
| 444 |
+
'KARP850101': 0.982000,
|
| 445 |
+
'KYTJ820101': 4.200000,
|
| 446 |
+
'ROSM880101': -2.180000,
|
| 447 |
+
'VINM940101': 0.931000,
|
| 448 |
+
'WERD780101': 0.720000,
|
| 449 |
+
'ZIMJ680101': 1.790000,
|
| 450 |
+
'ZIMJ680104': 5.960000,
|
| 451 |
+
},
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
# Feature descriptions
|
| 455 |
+
FEATURE_DESCRIPTIONS = {
|
| 456 |
+
'BIGC670101': 'Residue volume (Bigelow, 1967)',
|
| 457 |
+
'CHAM820101': 'Polarizability parameter (Charton-Charton, 1982)',
|
| 458 |
+
'CHOP780201': 'Normalized frequency of alpha-helix (Chou-Fasman, 1978b)',
|
| 459 |
+
'CHOP780202': 'Normalized frequency of beta-sheet (Chou-Fasman, 1978b)',
|
| 460 |
+
'CHOP780203': 'Normalized frequency of beta-turn (Chou-Fasman, 1978b)',
|
| 461 |
+
'EISD860101': 'Solvation free energy (Eisenberg-McLachlan, 1986)',
|
| 462 |
+
'FASG760101': 'Molecular weight (Fasman, 1976)',
|
| 463 |
+
'FAUJ830101': 'Hydrophobic parameter pi (Fauchere-Pliska, 1983)',
|
| 464 |
+
'GRAR740102': 'Polarity (Grantham, 1974)',
|
| 465 |
+
'GRAR740103': 'Volume (Grantham, 1974)',
|
| 466 |
+
'GUYH850101': 'Partition energy (Guy, 1985)',
|
| 467 |
+
'HOPT810101': 'Hydrophilicity value (Hopp-Woods, 1981)',
|
| 468 |
+
'JANJ780101': 'Average accessible surface area (Janin et al., 1978)',
|
| 469 |
+
'KARP850101': 'Flexibility parameter for no rigid neighbors (Karplus-Schulz, 1985)',
|
| 470 |
+
'KYTJ820101': 'Hydropathy index (Kyte-Doolittle, 1982)',
|
| 471 |
+
'ROSM880101': 'Side chain hydropathy, uncorrected for solvation (Roseman, 1988)',
|
| 472 |
+
'VINM940101': 'Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)',
|
| 473 |
+
'WERD780101': 'Propensity to be buried inside (Wertz-Scheraga, 1978)',
|
| 474 |
+
'ZIMJ680101': 'Hydrophobicity (Zimmerman et al., 1968)',
|
| 475 |
+
'ZIMJ680104': 'Isoelectric point (Zimmerman et al., 1968)',
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
# Convert to numpy array
|
| 479 |
+
def get_feature_vector(aa, feature_list=None):
    """Return a numpy vector of AAindex property values for one amino acid.

    Args:
        aa: Single-letter amino acid code. Unknown codes silently fall back
            to 'A' (Alanine), preserving a fixed-width output for any input.
        feature_list: Ordered iterable of AAindex feature codes to extract.
            None selects every available feature in sorted-key order.

    Returns:
        1-D numpy array with one value per requested feature.
    """
    # Unknown residues (e.g. 'X', gaps) map onto Alanine's property row.
    props = AA_PROPERTIES_AAINDEX.get(aa, AA_PROPERTIES_AAINDEX["A"])

    keys = sorted(props) if feature_list is None else feature_list
    return np.array([props[code] for code in keys])
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
def get_sequence_features(sequence, feature_list=None):
    """Get feature matrix for a sequence [L, N_features]"""
    rows = [get_feature_vector(residue, feature_list) for residue in sequence]
    return np.array(rows)
|
| 503 |
+
# Smoke test: print a sample of Alanine's properties and a small
# feature matrix, so the module can be sanity-checked by running it directly.
if __name__ == "__main__":
    print("Loaded 20 features for 20 amino acids")

    print("\nExample: Alanine (A)")
    alanine = AA_PROPERTIES_AAINDEX["A"]
    for code, val in list(alanine.items())[:5]:
        print(f"  {code}: {val:.4f} - {FEATURE_DESCRIPTIONS[code][:50]}")

    print("\nTest sequence features:")
    seq = "ARNDCQEG"
    features = get_sequence_features(seq)
    print(f"  Sequence: {seq}")
    print(f"  Feature matrix shape: {features.shape}")
|
src/library/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
src/library/hla_library/A_prot.fasta
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:283b031c980e3e64fb3985da4012c9682bb6cbe1bef03ef85035b833a20c24b3
|
| 3 |
+
size 1350496
|
src/library/hla_library/B_prot.fasta
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e5b7563254487f8ed55746a1fe90638b337fdb2a145923919208b511f898fbb
|
| 3 |
+
size 1623286
|
src/library/hla_library/C_prot.fasta
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cecd489f0feca57e06c3dff8dd2e92f1142dfecae6b3b81cc7299b16e49567ed
|
| 3 |
+
size 1313293
|
src/library/hla_library/E_prot.fasta
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5eab5e5d1ebb1d9d50c5893b52a713390ff2121e1fef184ca0639f4cabff1e14
|
| 3 |
+
size 9590
|
src/library/hla_prot.fasta
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6dba02678d244bf6b1c5eb421704d8319ffe7c21deb04c4bab0ba1fabd38e147
|
| 3 |
+
size 13753408
|
src/library/trajs_aa.tsv
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Species Gene Allele AccNum Functionality aa_seq
|
| 2 |
+
Homosap TRAJ1 TRAJ1*01 M94081 F YESITSQLQFGKGTRVSTSP
|
| 3 |
+
Homosap TRAJ10 TRAJ10*01 M94081 F ILTGGGNKLTFGTGTQLKVEL
|
| 4 |
+
Homosap TRAJ11 TRAJ11*01 M94081 F NSGYSTLTFGKGTMLLVSP
|
| 5 |
+
Homosap TRAJ12 TRAJ12*01 X02885 F MDSSYKLIFGSGTRLLVRP
|
| 6 |
+
Homosap TRAJ13 TRAJ13*01 M94081 F NSGGYQKVTFGIGTKLQVIP
|
| 7 |
+
Homosap TRAJ13 TRAJ13*02 AB258131 F NSGGYQKVTFGTGTKLQVIP
|
| 8 |
+
Homosap TRAJ14 TRAJ14*01 M94081 F IYSTFIFGSGTRLSVKP
|
| 9 |
+
Homosap TRAJ15 TRAJ15*01 X05775 F NQAGTALIFGKGTTLSVSS
|
| 10 |
+
Homosap TRAJ15 TRAJ15*02 M94081 F NQAGTALIFGKGTHLSVSS
|
| 11 |
+
Homosap TRAJ16 TRAJ16*01 M94081 F FSDGQKLLFARGTMLKVDL
|
| 12 |
+
Homosap TRAJ16 TRAJ16*02 IMGT000024 F FSDGQKLLFARGTMLKVDL
|
| 13 |
+
Homosap TRAJ17 TRAJ17*01 X05773 F IKAAGNKLTFGGGTRVLVKP
|
| 14 |
+
Homosap TRAJ18 TRAJ18*01 M94081 F DRGSTLGRLYFGRGTQLTVWP
|
| 15 |
+
Homosap TRAJ2 TRAJ2*01 M94081 F NTGGTIDKLTFGKGTHVFIIS
|
| 16 |
+
Homosap TRAJ20 TRAJ20*01 M94081 F SNDYKLSFGAGTTVTVRA
|
| 17 |
+
Homosap TRAJ21 TRAJ21*01 M94081 F YNFNKFYFGSGTKLNVKP
|
| 18 |
+
Homosap TRAJ22 TRAJ22*01 X02886 F SSGSARQLTFGSGTQLTVLP
|
| 19 |
+
Homosap TRAJ23 TRAJ23*01 M94081 F IYNQGGKLIFGQGTELSVKP
|
| 20 |
+
Homosap TRAJ23 TRAJ23*02 X58763 F IYNQGGKLIFGQGTELSVKP
|
| 21 |
+
Homosap TRAJ24 TRAJ24*01 X02887 F TTDSWGKFEFGAGTQVVVTP
|
| 22 |
+
Homosap TRAJ24 TRAJ24*02 M94081 F TTDSWGKLQFGAGTQVVVTP
|
| 23 |
+
Homosap TRAJ24 TRAJ24*03 IMGT000024 F TTDSWGKFQFGAGTQVVVTP
|
| 24 |
+
Homosap TRAJ25 TRAJ25*01 M94081 F XEGQGFSFIFGKGTRLLVKP
|
| 25 |
+
Homosap TRAJ26 TRAJ26*01 M94081 F DNYGQNFVFGPGTRLSVLP
|
| 26 |
+
Homosap TRAJ27 TRAJ27*01 M94081 F NTNAGKSTFGDGTTLTVKP
|
| 27 |
+
Homosap TRAJ28 TRAJ28*01 M94081 F YSGAGSYQLTFGKGTKLSVIP
|
| 28 |
+
Homosap TRAJ29 TRAJ29*01 M94081 F NSGNTPLVFGKGTRLSVIA
|
| 29 |
+
Homosap TRAJ3 TRAJ3*01 X02884 F GYSSASKIIFGSGTRLSIRP
|
| 30 |
+
Homosap TRAJ30 TRAJ30*01 M94081 F NRDDKIIFGKGTRLHILP
|
| 31 |
+
Homosap TRAJ31 TRAJ31*01 M94081 F NNNARLMFGDGTQLVVKP
|
| 32 |
+
Homosap TRAJ32 TRAJ32*01 M94081 F NYGGATNKLIFGTGTLLAVQP
|
| 33 |
+
Homosap TRAJ32 TRAJ32*02 IMGT000024 F NYGGATNKLIFGTGTLLAVQP
|
| 34 |
+
Homosap TRAJ33 TRAJ33*01 M94081 F DSNYQLIWGAGTKLIIKP
|
| 35 |
+
Homosap TRAJ34 TRAJ34*01 M35622 F SYNTDKLIFGTGTRLQVFP
|
| 36 |
+
Homosap TRAJ35 TRAJ35*01 M94081 F IGFGNVLHCGSGTQVIVLP
|
| 37 |
+
Homosap TRAJ36 TRAJ36*01 M94081 F QTGANNLFFGTGTRLTVIP
|
| 38 |
+
Homosap TRAJ37 TRAJ37*01 M94081 F GSGNTGKLIFGQGTTLQVKP
|
| 39 |
+
Homosap TRAJ37 TRAJ37*02 IMGT000024 F GSSNTGKLIFGQGTTLQVKP
|
| 40 |
+
Homosap TRAJ38 TRAJ38*01 M94081 F NAGNNRKLIWGLGTSLAVNP
|
| 41 |
+
Homosap TRAJ39 TRAJ39*01 M94081 F NNNAGNMLTFGGGTRLMVKP
|
| 42 |
+
Homosap TRAJ4 TRAJ4*01 M94081 F FSGGYNKLIFGAGTRLAVHP
|
| 43 |
+
Homosap TRAJ40 TRAJ40*01 M35620 F TTSGTYKYIFGTGTRLKVLA
|
| 44 |
+
Homosap TRAJ41 TRAJ41*01 M94081 F NSNSGYALNFGKGTSLLVTP
|
| 45 |
+
Homosap TRAJ42 TRAJ42*01 M94081 F NYGGSQGNLIFGKGTKLSVKP
|
| 46 |
+
Homosap TRAJ43 TRAJ43*01 M94081 F NNNDMRFGAGTRLTVKP
|
| 47 |
+
Homosap TRAJ44 TRAJ44*01 M35619 F NTGTASKLTFGTGTRLQVTL
|
| 48 |
+
Homosap TRAJ45 TRAJ45*01 M94081 F YSGGGADGLTFGKGTHLIIQP
|
| 49 |
+
Homosap TRAJ46 TRAJ46*01 M94081 F KKSSGDKLTFGTGTRLAVRP
|
| 50 |
+
Homosap TRAJ47 TRAJ47*01 M94081 F EYGNKLVFGAGTILRVKS
|
| 51 |
+
Homosap TRAJ47 TRAJ47*02 AF033825 (F) EYGNKLVFGAGTILRVKS
|
| 52 |
+
Homosap TRAJ48 TRAJ48*01 M94081 F SNFGNEKLTFGTGTRLTIIP
|
| 53 |
+
Homosap TRAJ49 TRAJ49*01 M94081 F NTGNQFYFGTGTSLTVIP
|
| 54 |
+
Homosap TRAJ5 TRAJ5*01 M94081 F DTGRRALTFGSGTRLQVQP
|
| 55 |
+
Homosap TRAJ50 TRAJ50*01 M94081 F KTSYDKVIFGPGTSLSVIP
|
| 56 |
+
Homosap TRAJ52 TRAJ52*01 M94081 F NAGGTSYGKLTFGQGTILTVHP
|
| 57 |
+
Homosap TRAJ53 TRAJ53*01 M94081 F NSGGSNYKLTFGKGTLLTVNP
|
| 58 |
+
Homosap TRAJ54 TRAJ54*01 M94081 F IQGAQKLVFGQGTRLTINP
|
| 59 |
+
Homosap TRAJ56 TRAJ56*01 M94081 F YTGANSKLTFGKGITLSVRP
|
| 60 |
+
Homosap TRAJ57 TRAJ57*01 M94081 F TQGGSEKLVFGKGTKLTVNP
|
| 61 |
+
Homosap TRAJ58 TRAJ58*01 M94081 F ETSGSRLTFGEGTQLTVNP
|
| 62 |
+
Homosap TRAJ6 TRAJ6*01 M16747 F ASGGSYIPTFGRGTSLIVHP
|
| 63 |
+
Homosap TRAJ7 TRAJ7*01 M94081 F DYGNNRLAFGKGNQVVVIP
|
| 64 |
+
Homosap TRAJ8 TRAJ8*01 M94081 F NTGFQKLVFGTGTRLLVSP
|
| 65 |
+
Homosap TRAJ9 TRAJ9*01 M94081 F GNTGGFKTIFGAGTRLFVKA
|
src/library/trajs_nt.tsv
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Species Gene Allele AccNum Functionality nt_seq
|
| 2 |
+
Homosap TRAJ10 TRAJ10*01 M94081 F atactcacgggaggaggaaacaaactcacctttgggacaggcactcagctaaaagtggaactca
|
| 3 |
+
Homosap TRAJ11 TRAJ11*01 M94081 F tgaattcaggatacagcaccctcacctttgggaaggggactatgcttctagtctctccag
|
| 4 |
+
Homosap TRAJ12 TRAJ12*01 X02885 F ggatggatagcagctataaattgatcttcgggagtgggaccagactgctggtcaggcctg
|
| 5 |
+
Homosap TRAJ13 TRAJ13*01 M94081 F tgaattctgggggttaccagaaagttacctttggaattggaacaaagctccaagtcatcccaa
|
| 6 |
+
Homosap TRAJ13 TRAJ13*02 AB258131 F tgaattctgggggttaccagaaagttacctttggaactggaacaaagctccaagtcatcccaa
|
| 7 |
+
Homosap TRAJ14 TRAJ14*01 M94081 F atttatagcacattcatctttgggagtgggacaagattatcagtaaaacctg
|
| 8 |
+
Homosap TRAJ15 TRAJ15*01 X05775 F ccaaccaggcaggaactgctctgatctttgggaagggaaccaccttatcagtgagttcca
|
| 9 |
+
Homosap TRAJ15 TRAJ15*02 M94081 F ccaaccaggcaggaactgctctgatctttgggaagggaacccacctatcagtgagttcca
|
| 10 |
+
Homosap TRAJ16 TRAJ16*01 M94081 F ggttttcagatggccagaagctgctctttgcaaggggaaccatgttaaaggtggatctta
|
| 11 |
+
Homosap TRAJ16 TRAJ16*02 IMGT000024 F ggttttcagatggccagaagctgctctttgcaagggggaccatgttaaaggtggatctta
|
| 12 |
+
Homosap TRAJ17 TRAJ17*01 X05773 F tgatcaaagctgcaggcaacaagctaacttttggaggaggaaccagggtgctagttaaaccaa
|
| 13 |
+
Homosap TRAJ18 TRAJ18*01 M94081 F ccgacagaggctcaaccctggggaggctatactttggaagaggaactcagttgactgtctggcctg
|
| 14 |
+
Homosap TRAJ20 TRAJ20*01 M94081 F gttctaacgactacaagctcagctttggagccggaaccacagtaactgtaagagcaa
|
| 15 |
+
Homosap TRAJ21 TRAJ21*01 M94081 F tacaacttcaacaaattttactttggatctgggaccaaactcaatgtaaaaccaa
|
| 16 |
+
Homosap TRAJ22 TRAJ22*01 X02886 F tttcttctggttctgcaaggcaactgacctttggatctgggacacaattgactgttttacctg
|
| 17 |
+
Homosap TRAJ23 TRAJ23*01 M94081 F tgatttataaccagggaggaaagcttatcttcggacagggaacggagttatctgtgaaaccca
|
| 18 |
+
Homosap TRAJ23 TRAJ23*02 X58763 F tgatttataaccagggaggaaagcttatcttcggacagggaacggagctatctgtgaaaccca
|
| 19 |
+
Homosap TRAJ24 TRAJ24*01 X02887 F tgacaactgacagctgggggaaattcgagtttggagcagggacccaggttgtggtcaccccag
|
| 20 |
+
Homosap TRAJ24 TRAJ24*02 M94081 F tgacaactgacagctgggggaaattgcagtttggagcagggacccaggttgtggtcaccccag
|
| 21 |
+
Homosap TRAJ24 TRAJ24*03 IMGT000024 F tgacaactgacagctgggggaaattccagtttggagcagggacccaggttgtggtcaccccag
|
| 22 |
+
Homosap TRAJ26 TRAJ26*01 M94081 F gggataactatggtcagaattttgtctttggtcccggaaccagattgtccgtgctgccct
|
| 23 |
+
Homosap TRAJ27 TRAJ27*01 M94081 F taacaccaatgcaggcaaatcaacctttggggatgggactacgctcactgtgaagccaa
|
| 24 |
+
Homosap TRAJ28 TRAJ28*01 M94081 F catactctggggctgggagttaccaactcactttcgggaaggggaccaaactctcggtcataccaa
|
| 25 |
+
Homosap TRAJ29 TRAJ29*01 M94081 F ggaattcaggaaacacacctcttgtctttggaaagggcacaagactttctgtgattgcaa
|
| 26 |
+
Homosap TRAJ3 TRAJ3*01 X02884 F ggggtacagcagtgcttccaagataatctttggatcagggaccagactcagcatccggccaa
|
| 27 |
+
Homosap TRAJ30 TRAJ30*01 M94081 F tgaacagagatgacaagatcatctttggaaaagggacacgacttcatattctcccca
|
| 28 |
+
Homosap TRAJ31 TRAJ31*01 M94081 F ggaataacaatgccagactcatgtttggagatggaactcagctggtggtgaagccca
|
| 29 |
+
Homosap TRAJ32 TRAJ32*01 M94081 F tgaattatggcggtgctacaaacaagctcatctttggaactggcactctgcttgctgtccagccaa
|
| 30 |
+
Homosap TRAJ32 TRAJ32*02 IMGT000024 F tgaattatggtggtgctacaaacaagctcatctttggaactggcactctgcttgctgtccagccaa
|
| 31 |
+
Homosap TRAJ33 TRAJ33*01 M94081 F tggatagcaactatcagttaatctggggcgctgggaccaagctaattataaagccag
|
| 32 |
+
Homosap TRAJ34 TRAJ34*01 M35622 F tcttataacaccgacaagctcatctttgggactgggaccagattacaagtctttccaa
|
| 33 |
+
Homosap TRAJ35 TRAJ35*01 M94081 F gataggctttgggaatgtgctgcattgcgggtccggcactcaagtgattgttttaccac
|
| 34 |
+
Homosap TRAJ36 TRAJ36*01 M94081 F tcaaactggggcaaacaacctcttctttgggactggaacgagactcaccgttattccct
|
| 35 |
+
Homosap TRAJ37 TRAJ37*01 M94081 F tggctctggcaacacaggcaaactaatctttgggcaagggacaactttacaagtaaaaccag
|
| 36 |
+
Homosap TRAJ37 TRAJ37*02 IMGT000024 F tggctctagcaacacaggcaaactaatctttgggcaagggacaactttacaagtaaaaccag
|
| 37 |
+
Homosap TRAJ38 TRAJ38*01 M94081 F taatgctggcaacaaccgtaagctgatttggggattgggaacaagcctggcagtaaatccga
|
| 38 |
+
Homosap TRAJ39 TRAJ39*01 M94081 F tgaataataatgcaggcaacatgctcacctttggagggggaacaaggttaatggtcaaacccc
|
| 39 |
+
Homosap TRAJ4 TRAJ4*01 M94081 F tgttttctggtggctacaataagctgatttttggagcagggaccaggctggctgtacacccat
|
| 40 |
+
Homosap TRAJ40 TRAJ40*01 M35620 F actacctcaggaacctacaaatacatctttggaacaggcaccaggctgaaggttttagcaa
|
| 41 |
+
Homosap TRAJ41 TRAJ41*01 M94081 F gaactcaaattccgggtatgcactcaacttcggcaaaggcacctcgctgttggtcacacccc
|
| 42 |
+
Homosap TRAJ42 TRAJ42*01 M94081 F tgaattatggaggaagccaaggaaatctcatctttggaaaaggcactaaactctctgttaaaccaa
|
| 43 |
+
Homosap TRAJ43 TRAJ43*01 M94081 F acaataacaatgacatgcgctttggagcagggaccagactgacagtaaaaccaa
|
| 44 |
+
Homosap TRAJ44 TRAJ44*01 M35619 F taaataccggcactgccagtaaactcacctttgggactggaacaagacttcaggtcacgctcg
|
| 45 |
+
Homosap TRAJ45 TRAJ45*01 M94081 F tgtattcaggaggaggtgctgacggactcacctttggcaaagggactcatctaatcatccagccct
|
| 46 |
+
Homosap TRAJ46 TRAJ46*01 M94081 F agaagaaaagcagcggagacaagctgacttttgggaccgggactcgtttagcagttaggccca
|
| 47 |
+
Homosap TRAJ47 TRAJ47*01 M94081 F tggaatatggaaacaaactggtctttggcgcaggaaccattctgagagtcaagtcct
|
| 48 |
+
Homosap TRAJ47 TRAJ47*02 AF033825 (F) tggaatatggaaacaagctggtctttggcgcaggaaccattctgagagtcaagtcct
|
| 49 |
+
Homosap TRAJ48 TRAJ48*01 M94081 F tatctaactttggaaatgagaaattaacctttgggactggaacaagactcaccatcataccca
|
| 50 |
+
Homosap TRAJ49 TRAJ49*01 M94081 F gaacaccggtaaccagttctattttgggacagggacaagtttgacggtcattccaa
|
| 51 |
+
Homosap TRAJ5 TRAJ5*01 M94081 F tggacacgggcaggagagcacttacttttgggagtggaacaagactccaagtgcaaccaa
|
| 52 |
+
Homosap TRAJ50 TRAJ50*01 M94081 F tgaaaacctcctacgacaaggtgatatttgggccagggacaagcttatcagtcattccaa
|
| 53 |
+
Homosap TRAJ52 TRAJ52*01 M94081 F ctaatgctggtggtactagctatggaaagctgacatttggacaagggaccatcttgactgtccatccaa
|
| 54 |
+
Homosap TRAJ53 TRAJ53*01 M94081 F agaatagtggaggtagcaactataaactgacatttggaaaaggaactctcttaaccgtgaatccaa
|
| 55 |
+
Homosap TRAJ54 TRAJ54*01 M94081 F taattcagggagcccagaagctggtatttggccaaggaaccaggctgactatcaacccaa
|
| 56 |
+
Homosap TRAJ56 TRAJ56*01 M94081 F ttatactggagccaatagtaagctgacatttggaaaaggaataactctgagtgttagaccag
|
| 57 |
+
Homosap TRAJ57 TRAJ57*01 M94081 F taactcagggcggatctgaaaagctggtctttggaaagggaacgaaactgacagtaaacccat
|
| 58 |
+
Homosap TRAJ6 TRAJ6*01 M16747 F tgcatcaggaggaagctacatacctacatttggaagaggaaccagccttattgttcatccgt
|
| 59 |
+
Homosap TRAJ7 TRAJ7*01 M94081 F tgactatgggaacaacagactcgcttttgggaaggggaaccaagtggtggtcataccaa
|
| 60 |
+
Homosap TRAJ8 TRAJ8*01 M94081 F tgaacacaggctttcagaaacttgtatttggaactggcacccgacttctggtcagtccaa
|
| 61 |
+
Homosap TRAJ9 TRAJ9*01 M94081 F ggaaatactggaggcttcaaaactatctttggagcaggaacaagactatttgttaaagcaa
|
src/library/travs_aa.tsv
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Species Gene Allele AccNum Functionality aa_seq
|
| 2 |
+
Homosap TRAV1-1 TRAV1-1*01 AE000658 F GQSLEQPSEVTAVEGAIVQINCTYQTSGFYGLSWYQQHDGGAPTFLSYNALDGLEETGRFSSFLSRSDSYGYLLLQELQMKDSASYFCAVR
|
| 3 |
+
Homosap TRAV1-1 TRAV1-1*02 X04939 (F) GQSLEQPSEVTAVEGAIVQINCTYQTSGFYGLSWYQQHDGGAPTFLSYNGLDGLEETGRFSSFLSRSDSYGYLLLQELQMKDSASYFCA
|
| 4 |
+
Homosap TRAV1-2 TRAV1-2*01 AE000658 F GQNIDQPTEMTATEGAIVQINCTYQTSGFNGLFWYQQHAGEAPTFLSYNVLDGLEEKGRFSSFLSRSKGYSYLLLKELQMKDSASYLCAVR
|
| 5 |
+
Homosap TRAV1-2 TRAV1-2*02 U32544 [F] GQNIDQPTEMTATEGAIVQINCTYQTSGFNGLFWYQQHAGEAPTFLSYNVLDGLEEKG
|
| 6 |
+
Homosap TRAV1-2 TRAV1-2*03 IMGT000024 F GQNIDQPTEMTATEGAIVQINCTYQTSGFNGLFWYQQHAGEAPTFLSYNVLDGLEEKGRFSSFLSRSKGYSYLLLKELQMKDSASYLCAVR
|
| 7 |
+
Homosap TRAV10 TRAV10*01 AE000659 F KNQVEQSPQSLIILEGKNCTLQCNYTVSPFSNLRWYKQDTGRGPVSLTIMTFSENTKSNGRYTATLDADTKQSSLHITASQLSDSASYICVVS
|
| 8 |
+
Homosap TRAV10 TRAV10*02 IMGT000024 F KNQVEQSPQSLIILEGKNCTLQCNYTVSPFSNLRWYKQDTGRGPVSLTIMTFSENTKSNGRYTATLDADTKQSSLHITASQLSDSASYICVVS
|
| 9 |
+
Homosap TRAV12-1 TRAV12-1*01 AE000659 F RKEVEQDPGPFNVPEGATVAFNCTYSNSASQSFFWYRQDCRKEPKLLMSVYSSGNEDGRFTAQLNRASQYISLLIRDSKLSDSATYLCVVN
|
| 10 |
+
Homosap TRAV12-1 TRAV12-1*02 M17657 (F) RKEVEQDPGPFNVPEGATVAFNCTYSNSASQSFFWYRQDCRKEPKLLMSVYSSGNEDGRFTAHVNRASQYISLLIRDSKLSDSATYLCVVN
|
| 11 |
+
Homosap TRAV12-2 TRAV12-2*01 AE000659 F QKEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPELIMFIYSNGDKEDGRFTAQLNKASQYVSLLIRDSQPSDSATYLCAVN
|
| 12 |
+
Homosap TRAV12-2 TRAV12-2*02 M81774 (F) QKEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPELIMSIYSNGDKEDGRFTAQLNKASQYVSLLIRDSQPSDSATYLCAV
|
| 13 |
+
Homosap TRAV12-2 TRAV12-2*03 X04946 (F) GPLSVPEGAIASLNCTYSDRVSQSFFWYRQYSGKSPELIMSIYSNGDKEDGRFTAQLNKASQYVSLLIRDSQPSDSATYLCAVN
|
| 14 |
+
Homosap TRAV12-3 TRAV12-3*01 AE000659 F QKEVEQDPGPLSVPEGAIVSLNCTYSNSAFQYFMWYRQYSRKGPELLMYTYSSGNKEDGRFTAQVDKSSKYISLFIRDSQPSDSATYLCAMS
|
| 15 |
+
Homosap TRAV12-3 TRAV12-3*02 M17656 (F) QKEVEQDPGPLSVPEGAIVSLNCTYSNSAFQYFMWYRQYSRIGPELLMYTYSSGNKEDGRFTAQVDKSSKYISLFIRDSQPSDSATYLCAMS
|
| 16 |
+
Homosap TRAV13-1 TRAV13-1*01 AE000659 F GENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELGKGPQLIIDIRSNVGEKKDQRIAVTLNKTAKHFSLHITETQPEDSAVYFCAAS
|
| 17 |
+
Homosap TRAV13-1 TRAV13-1*02 X04954 (F) GENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELGKRPQLIIDIRSNVGEKKDQRIAVTLNKTAKHFSLHITETQPEDSAVYFCAAS
|
| 18 |
+
Homosap TRAV13-1 TRAV13-1*03 L11162 [F] GENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELGKRPQLIIDIRSNVGEKKDQRIAVTLNKTAKHFSLQIT
|
| 19 |
+
Homosap TRAV13-2 TRAV13-2*01 AE000659 F GESVGLHLPTLSVQEGDNSIINCAYSNSASDYFIWYKQESGKGPQFIIDIRSNMDKRQGQRVTVLLNKTVKHLSLQIAATQPGDSAVYFCAEN
|
| 20 |
+
Homosap TRAV13-2 TRAV13-2*02 M17658 (F) GESVGLHLPTLSVQEGDNSIINCAYSNSASDYFIWYKQESGKGPQFIIDIRSNMDKRQGQRVTVLLNKTVKHLSLQIAATQPGDSAVYFCAE
|
| 21 |
+
Homosap TRAV14/DV4 TRAV14/DV4*01 M21626 F AQKITQTQPGMFVQEKEAVTLDCTYDTSDPSYGLFWYKQPSSGEMIFLIYQGSYDQQNATEGRYSLNFQKARKSANLVISASQLGDSAMYFCAMRE
|
| 22 |
+
Homosap TRAV14/DV4 TRAV14/DV4*02 AE000659 F AQKITQTQPGMFVQEKEAVTLDCTYDTSDQSYGLFWYKQPSSGEMIFLIYQGSYDEQNATEGRYSLNFQKARKSANLVISASQLGDSAMYFCAMRE
|
| 23 |
+
Homosap TRAV14/DV4 TRAV14/DV4*03 M21624 (F) AQKITQTQPGMFVQEKEAVTLDCTYDTSDPSYGLFWYKQPSSGEMIFLIYQGSYDQQNATEGRYSLNFQKARKSANLVISASQLGDSAMYFCAM
|
| 24 |
+
Homosap TRAV14/DV4 TRAV14/DV4*04 L09758 [F] QKITQTQPGMFVQEKEAVTLDCTYDTSDQSYGLFWYKQPSSGEMIFLIYQGSYDEQNATEGRYSLNFQKARKSANLVISASQLGDSAMYF
|
| 25 |
+
Homosap TRAV16 TRAV16*01 AE000659 F AQRVTQPEKLLSVFKGAPVELKCNYSYSGSPELFWYVQYSRQRLQLLLRHISRESIKGFTADLNKGETSFHLKKPFAQEEDSAMYYCALS
|
| 26 |
+
Homosap TRAV17 TRAV17*01 AE000660 F SQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSGRGLVHLILIRSNEREKHSGRLRVTLDTSKKSSSLLITASRAADTASYFCATD
|
| 27 |
+
Homosap TRAV18 TRAV18*01 AE000660 F GDSVTQTEGPVTLPERAALTLNCTYQSSYSTFLFWYVQYLNKEPELLLKSSENQETDSRGFQASPIKSDSSFHLEKPSVQLSDSAVYYCALR
|
| 28 |
+
Homosap TRAV19 TRAV19*01 AE000660 F AQKVTQAQTEISVVEKEDVTLDCVYETRDTTYYLFWYKQPPSGELVFLIRRNSFDEQNEISGRYSWNFQKSTSSFNFTITASQVVDSAVYFCALSE
|
| 29 |
+
Homosap TRAV2 TRAV2*01 AE000658 F KDQVFQPSTVASSEGAVVEIFCNHSVSNAYNFFWYLHFPGCAPRLLVKGSKPSQQGRYNMTYERFSSSLLILQVREADAAVYYCAVE
|
| 30 |
+
Homosap TRAV2 TRAV2*02 M17659 (F) KDQVFQPSTVASSEGAVVEIFCNHSVSNAYNFFWHLHFPGCAPRLLVKGSKPSQQGRYNMTYERFSSSLLILQVREADAAVYYCAVAW
|
| 31 |
+
Homosap TRAV20 TRAV20*01 AE000660 F EDQVTQSPEALRLQEGESSSLNCSYTVSGLRGLFWYRQDPGKGPEFLFTLYSAGEEKEKERLKATLTKKESFLHITAPKPEDSATYLCAVQ
|
| 32 |
+
Homosap TRAV20 TRAV20*02 IMGT000024 F EDQVTQSPEALRLQEGESSSLNCSYTVSGLRGLFWYRQDPGKGPEFLFTLYSAGEEKEKERLKATLTKKESFLHITAPKPEDSATYLCAVQ
|
| 33 |
+
Homosap TRAV20 TRAV20*03 S60789 (F) EDQVTQSPEALRLQEGESRSLNCSYTVSGLRGLFWYRQDPGKGPEFLFTLYSAGEEKEKERLKATLTKKESFLHITAPKPEDSATYLC
|
| 34 |
+
Homosap TRAV20 TRAV20*04 X70305 (F) EDQVTQSPEALRLQEGESSSLNCSCTVSGLRGLFWYRQDPGKGPEFLFTLYSAGEEKEKERLKATLTKKESFLHITAPKPEDSATYLCA
|
| 35 |
+
Homosap TRAV21 TRAV21*01 AE000660 F KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTSLLLIQSSQREQTSGRLNASLDKSSGRSTLYIAASQPGDSATYLCAVR
|
| 36 |
+
Homosap TRAV21 TRAV21*02 X58736 (F) KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTSLLLIQSSQREQTSGRLNASLDKSSGRSTLYIAASQPGDSATYLCA
|
| 37 |
+
Homosap TRAV22 TRAV22*01 AE000660 F GIQVEQSPPDLILQEGANSTLRCNFSDSVNNLQWFHQNPWGQLINLFYIPSGTKQNGRLSATTVATERYSLLYISSSQTTDSGVYFCAVE
|
| 38 |
+
Homosap TRAV23/DV6 TRAV23/DV6*01 AE000660 F QQQVKQSPQSLIVQKGGISIINCAYENTAFDYFPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSLHIMDSQPGDSATYFCAAS
|
| 39 |
+
Homosap TRAV23/DV6 TRAV23/DV6*02 M17660 (F) QQQVKQSPQSLIVQKGGIPIINCAYENTAFDYFPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSLHIMDSQPGDSATYFCAAS
|
| 40 |
+
Homosap TRAV23/DV6 TRAV23/DV6*03 M97704 (F) QQQVKQSPQSLIVQKGGISIINCAYENTAFDYFPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSLHIMDSQPGDSATYFCAAS
|
| 41 |
+
Homosap TRAV23/DV6 TRAV23/DV6*04 Y10411 [F] QQVKQSPQSLIVQKGGISIINCAYENTAFDYFPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSLHIMDSQPGDSATYFC
|
| 42 |
+
Homosap TRAV23/DV6 TRAV23/DV6*05 IMGT000024 F QQQVKQSPQSLIVQKGGISIINCAYENTAFDYFPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSSHIMDSQPGDSATYFCAAS
|
| 43 |
+
Homosap TRAV24 TRAV24*01 AE000660 F ILNVEQSPQSLHVQEGDSTNFTCSFPSSNFYALHWYRWETAKSPEALFVMTLNGDEKKKGRISATLNTKEGYSYLYIKGSQPEDSATYLCAF
|
| 44 |
+
Homosap TRAV24 TRAV24*02 M17661 (F) ILNVEQGPQSLHVQEGDSTNFTCSFPSSNFYALHWYRWETAKTPEALFVMTLNGDEKKKGRISATLNTKEGYSYLYIKGSQPEDSATYLCAF
|
| 45 |
+
Homosap TRAV25 TRAV25*01 AE000660 F GQQVMQIPQYQHVQEGEDFTTYCNSSTTLSNIQWYKQRPGGHPVFLIQLVKSGEVKKQKRLTFQFGEAKKNSSLHITATQTTDVGTYFCAG
|
| 46 |
+
Homosap TRAV26-1 TRAV26-1*01 AE000660 F DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYIIHGLKNNETNEMASLIITEDRKSSTLILPHATLRDTAVYYCIVRV
|
| 47 |
+
Homosap TRAV26-1 TRAV26-1*02 IMGT000024 F DAKTTQPTSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYIIHGLKNNETNEMASLIITEDRKSSTLILPHATLRDTAVYYCIVRV
|
| 48 |
+
Homosap TRAV26-1 TRAV26-1*03 L06886 (F) DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQNIIHGLKNNETNEMASLIITEDRKSSTLILPHATLRDTAVYYCI
|
| 49 |
+
Homosap TRAV26-2 TRAV26-2*01 AE000660 F DAKTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVIHGLTSNVNNRMASLAIAEDRKSSTLILHRATLRDAAVYYCILRD
|
| 50 |
+
Homosap TRAV26-2 TRAV26-2*02 L11160 [F] DAKTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVIHGLTSNVNNRMACVAIAEDRKSST
|
| 51 |
+
Homosap TRAV27 TRAV27*01 AE000660 F TQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEGPVLLVTVVTGGEVKKLKRLTFQFGDARKDSSLHITAAQPGDTGLYLCAG
|
| 52 |
+
Homosap TRAV27 TRAV27*02 X04957 (F) TQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEGPVLLVTVVTGGEVKKLKRLTFQFGDARKDSSLHITAAQPGDTGHYLCA
|
| 53 |
+
Homosap TRAV27 TRAV27*03 IMGT000024 F TQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEGPVLLVTVVTGGEVKKLKRLTFQFGDARKDSSLHITAAQTGDTGLYLCAG
|
| 54 |
+
Homosap TRAV29/DV5 TRAV29/DV5*01 AE000660 F DQQVKQNSPSLSVQEGRISILNCDYTNSMFDYFLWYKKYPAEGPTFLISISSIKDKNEDGRFTVFLNKSAKHLSLHIVPSQPGDSAVYFCAAS
|
| 55 |
+
Homosap TRAV29/DV5 TRAV29/DV5*02 S81645 F DQQVKQNSPSLSVQEGRISILNCDYTNSMFDYFLWYKKYPAEGPTFLISISSIKDKNEDGRFTVFLNKSAKHLSLDIVPSQPGDSAVYFCAAS
|
| 56 |
+
Homosap TRAV29/DV5 TRAV29/DV5*04 IMGT000024 F DQQVKQNSPSLSVQEGRISILNCDYTNSMFDYFLWYKKYPAEGPTFLISISSIKDKNEDGRFTVFLNKSAKHLSLHIVPSQPGDSAVYFCAAS
|
| 57 |
+
Homosap TRAV3 TRAV3*01 AE000658 F AQSVAQPEDQVNVAEGNPLTVKCTYSVSGNPYLFWYVQYPNRGLQFLLKYITGDNLVKGSYGFEAEFNKSQTSFHLKKPSALVSDSALYFCAVRD
|
| 58 |
+
Homosap TRAV30 TRAV30*01 AE000660 F QQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHGEAPVFLMILLKGGEQKGHEKISASFNEKKQQSSLYLTASQLSYSGTYFCGTE
|
| 59 |
+
Homosap TRAV30 TRAV30*02 X58768 (F) QQPVQSPQAVILREGEDAVTNCSSSKALYSVHWYRQKHGEAPVFLMILLKGGEQMRREKISASFNEKKQQSSLYLTASQLSYSGTYFCG
|
| 60 |
+
Homosap TRAV30 TRAV30*03 L06883 (F) QQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHGEAPVFLMILLKGGEQKGHEKISASFNEKKRQSSLYLTASQLSYSGTYFCG
|
| 61 |
+
Homosap TRAV30 TRAV30*04 U32537 [F] QQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHGEAPVFLMILLKGGEQKRHEKISASFNEKKQQSSLYLT
|
| 62 |
+
Homosap TRAV30 TRAV30*05 IMGT000024 F QQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHGEAPVFLMILLKGGEQKGHDKISASFNEKKQQSSLYLTASQLSYSGTYFCGTE
|
| 63 |
+
Homosap TRAV34 TRAV34*01 AE000660 F SQELEQSPQSLIVQEGKNLTINCTSSKTLYGLYWYKQKYGEGLIFLMMLQKGGEEKSHEKITAKLDEKKQQSSLHITASQPSHAGIYLCGAD
|
| 64 |
+
Homosap TRAV35 TRAV35*01 AE000660 F GQQLNQSPQSMFIQEGEDVSMNCTSSSIFNTWLWYKQEPGEGPVLLIALYKAGELTSNGRLTAQFGITRKDSFLNISASIPSDVGIYFCAGQ
|
| 65 |
+
Homosap TRAV35 TRAV35*02 X58738 (F) GQQLNQSPQSMFIQEGEDVSMNCTSSSIFNTWLWYKQDPGEGPVLLIALYKAGELTSNGRLTAQFGITRKDSFLNISASIPSDVGIYFCA
|
| 66 |
+
Homosap TRAV36/DV7 TRAV36/DV7*01 AE000660 F EDKVVQSPLSLVVHEGDTVTLNCSYEVTNFRSLLWYKQEKKAPTFLFMLTSSGIEKKSGRLSSILDKKELSSILNITATQTGDSAIYLCAVE
|
| 67 |
+
Homosap TRAV36/DV7 TRAV36/DV7*02 X61070 (F) EDKVVQSPQSLVVHEGDTVTLNCSYEMTNFRSLQWYKQEKKAPTFLFMLTSSGIEKKSGRLSSILDKKELFSILNITATQTGDSAVYLCAV
|
| 68 |
+
Homosap TRAV36/DV7 TRAV36/DV7*03 X58767 (F) EDKVVQSPLSLVVHEGDTVTPNCSYEVTNFRSLLWYKQEKKAPTFLFMLTSSGIEKKSGRLSSILDKKELFSILNITATQTGDSAVYLCA
|
| 69 |
+
Homosap TRAV36/DV7 TRAV36/DV7*04 Z46643 (F) EDKVVQSPLSLVVHEGDTVTLNCSYEVTNFRSLLWYKQEKKAPTFLFMLTSSGIEKKSGRLSSILDKKELFSILNITATQTGDSAVYLCA
|
| 70 |
+
Homosap TRAV36/DV7 TRAV36/DV7*05 IMGT000024 F EDKVVQSPLSLVVHEGDTVTLNCSYEVTNFRSLLWYKQEKKAPTFLFMLTSSGIEKKSGRLSSILDKKELFSILNITATQTGDSAIYLCAVE
|
| 71 |
+
Homosap TRAV38-1 TRAV38-1*01 AE000661 F AQTVTQSQPEMSVQEAETVTLSCTYDTSENNYYLFWYKQPPSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDTAMYFCAFMK
|
| 72 |
+
Homosap TRAV38-1 TRAV38-1*02 M64355 (F) AQTVTQSQPEMSVQEAETVTLSCTYDTSENDYYLFWYKQPPSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDTAMYFCA
|
| 73 |
+
Homosap TRAV38-1 TRAV38-1*03 M95394 (F) AQTVTQSQPEMSVQEAETVTLSCTYDTSESNYYLFWYKQPPSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDTAMYFCAF
|
| 74 |
+
Homosap TRAV38-1 TRAV38-1*04 L06880 (F) AQTVTQSQPEMSVQEAETVTLSCTYDTSENNYYLFWYKQPPSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDTAMYFCA
|
| 75 |
+
Homosap TRAV38-2/DV8 TRAV38-2/DV8*01 AE000661 F AQTVTQSQPEMSVQEAETVTLSCTYDTSESDYYLFWYKQPPSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDAAMYFCAYRS
|
| 76 |
+
Homosap TRAV39 TRAV39*01 AE000661 F ELKVEQNPLFLSMQEGKNYTIYCNYSTTSDRLYWYRQDPGKSLESLFVLLSNGAVKQEGRLMASLDTKARLSTLHITAAVHDLSATYFCAVD
|
| 77 |
+
Homosap TRAV4 TRAV4*01 AE000658 F LAKTTQPISMDSYEGQEVNITCSHNNIATNDYITWYQQFPSQGPRFIIQGYKTKVTNEVASLFIPADRKSSTLSLPRVSLSDTAVYYCLVGD
|
| 78 |
+
Homosap TRAV40 TRAV40*01 X73521 F SNSVKQTGQITVSEGASVTMNCTYTSTGYPTLFWYVEYPSKPLQLLQRETMENSKNFGGGNIKDKNSPIVKYSVQVSDSAVYYCLLG
|
| 79 |
+
Homosap TRAV41 TRAV41*01 AE000661 F KNEVEQSPQNLTAQEGEFITINCSYSVGISALHWLQQHPGGGIVSLFMLSSGKKKHGRLIATINIQEKHSSLHITASHPRDSAVYICAVR
|
| 80 |
+
Homosap TRAV5 TRAV5*01 AE000659 F GEDVEQSLFLSVREGDSSVINCTYTDSSSTYLYWYKQEPGAGLQLLTYIFSNMDMKQDQRLTVLLNKKDKHLSLRIADTQTGDSAIYFCAES
|
| 81 |
+
Homosap TRAV6 TRAV6*01 AE000659 F SQKIEQNSEALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQPADSATYLCALD
|
| 82 |
+
Homosap TRAV6 TRAV6*02 X58747 (F) SQKIEQNSEALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQPADSATYLCA
|
| 83 |
+
Homosap TRAV6 TRAV6*03 Z49060 [F] EALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQPADSATYLCA
|
| 84 |
+
Homosap TRAV6 TRAV6*04 Y10409 [F] EALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHVTASQPADSATYLCA
|
| 85 |
+
Homosap TRAV6 TRAV6*05 Y10410 [F] EALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQPADSATYLCA
|
| 86 |
+
Homosap TRAV6 TRAV6*06 U32542 [F] SQKIEQNSEALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLNQ
|
| 87 |
+
Homosap TRAV6 TRAV6*07 IMGT000024 F SQKIEQNSEALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQPADSATYLCALD
|
| 88 |
+
Homosap TRAV7 TRAV7*01 AE000659 F ENQVEHSPHFLGPQQGDVASMSCTYSVSRFNNLQWYRQNTGMGPKHLLSMYSAGYEKQKGRLNATLLKNGSSLYITAVQPEDSATYFCAVD
|
| 89 |
+
Homosap TRAV8-1 TRAV8-1*01 AE000659 F AQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPGQHLQLLLKYFSGDPLVKGIKGFEAEFIKSKFSFNLRKPSVQWSDTAEYFCAVN
|
| 90 |
+
Homosap TRAV8-1 TRAV8-1*02 U32520 [F] AQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPGQHLQLLLKYFSGDPLVKGIKGVEAEFIKSKFSFNLRKPSVQW
|
| 91 |
+
Homosap TRAV8-2 TRAV8-2*01 AE000659 F AQSVTQLDSHVSVSEGTPVLLRCNYSSSYSPSLFWYVQHPNKGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCVVS
|
| 92 |
+
Homosap TRAV8-2 TRAV8-2*02 M17650 (F) AQSVTQLSSHVSVSEGTPVLLRCNYSSSYSPSLFWYVQHPNKGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCVV
|
| 93 |
+
Homosap TRAV8-2 TRAV8-2*03 IMGT000024 F AQSVTQLDSHVSVSEGTPVLLRCNYSSSYSPSLFWYVQHPNKGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCVVS
|
| 94 |
+
Homosap TRAV8-3 TRAV8-3*01 AE000659 F AQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPGQGLQLLLKYFSGDTLVQGIKGFEAEFKRSQSSFNLRKPSVHWSDAAEYFCAVG
|
| 95 |
+
Homosap TRAV8-3 TRAV8-3*02 M35617 (F) AQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPGQGLQLLLKYFSGDTLVQGIKGFEAEFKRSQSSFNLRKPSVHWSDAAEYFCAVV
|
| 96 |
+
Homosap TRAV8-3 TRAV8-3*03 L06885 (F) AQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPGQGLQLLLKYFSGDTLVQGIKGFEAEFKRSQSSFNLRKPSVHWSDASEYFCA
|
| 97 |
+
Homosap TRAV8-4 TRAV8-4*01 AE000659 F AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVS
|
| 98 |
+
Homosap TRAV8-4 TRAV8-4*02 M12423 (F) AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVS
|
| 99 |
+
Homosap TRAV8-4 TRAV8-4*03 D13077 (F) AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTTGATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCA
|
| 100 |
+
Homosap TRAV8-4 TRAV8-4*04 M12959 (F) AQSVTQLGSHVSVSERALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVS
|
| 101 |
+
Homosap TRAV8-4 TRAV8-4*05 X63455 (F) AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVS
|
| 102 |
+
Homosap TRAV8-4 TRAV8-4*06 K02777 (F) GATHYCCPPILFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPAAHMSDAAEYFCAVS
|
| 103 |
+
Homosap TRAV8-4 TRAV8-4*07 M17665 (F) VEPYLFWYVQYPNQGLQLLLKYTTGATLVKGINGFEAEFKKSETSFHLTKPSAHMTDPAEYFCAV
|
| 104 |
+
Homosap TRAV8-6 TRAV8-6*01 X02850 F AQSVTQLDSQVPVFEEAPVELRCNYSSSVSVYLFWYVQYPNQGLQLLLKYLSGSTLVESINGFEAEFNKSQTSFHLRKPSVHISDTAEYFCAVS
|
| 105 |
+
Homosap TRAV8-6 TRAV8-6*02 AE000659 F AQSVTQLDSQVPVFEEAPVELRCNYSSSVSVYLFWYVQYPNQGLQLLLKYLSGSTLVKGINGFEAEFNKSQTSFHLRKPSVHISDTAEYFCAVS
|
| 106 |
+
Homosap TRAV9-1 TRAV9-1*01 AE000659 F GDSVVQTEGQVLPSEGDSLIVNCSYETTQYPSLFWYVQYPGEGPQLHLKAMKANDKGRNKGFEAMYRKETTSFHLEKDSVQESDSAVYFCALS
|
| 107 |
+
Homosap TRAV9-2 TRAV9-2*01 AE000659 F GNSVTQMEGPVTLSEEAFLTINCTYTATGYPSLFWYVQYPGEGLQLLLKATKADDKGSNKGFEATYRKETTSFHLEKGSVQVSDSAVYFCALS
|
| 108 |
+
Homosap TRAV9-2 TRAV9-2*02 IMGT000024 F GDSVTQMEGPVTLSEEAFLTINCTYTATGYPSLFWYVQYPGEGLQLLLKATKADDKGSNKGFEATYRKETTSFHLEKGSVQVSDSAVYFCALS
|
| 109 |
+
Homosap TRAV9-2 TRAV9-2*03 L06881 (F) GDSVTQMEGPVTLSEEAFLTINCTYTATGYPSLFWYVQYPGEGLQLLLKATKADDKGSNKGFEATYRKETTSFHLEKGSVQVSDSAVYFCA
|
| 110 |
+
Homosap TRAV9-2 TRAV9-2*04 L06882 (F) GNSVTQMEGPVTLSEEAFLTINCTYTATGYPSLFWYVQYPGEGLQLLLKATKADDKGSNKGFEATYRKETTSFHLEKGSVQVSDSAVYFCA
|
src/library/travs_nt.tsv
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Species Gene Allele AccNum Functionality nt_seq
|
| 2 |
+
Homosap TRAV1-1 TRAV1-1*01 AE000658 F ggacaaagccttgagcagccctctgaagtgacagctgtggaaggagccattgtccagataaactgcacgtaccagacatctgggttttatgggctgtcctggtaccagcaacatgatggcggagcacccacatttctttcttacaatgctctggatggtttggaggagacaggtcgtttttcttcattccttagtcgctctgatagttatggttacctccttctacaggagctccagatgaaagactctgcctcttacttctgcgctgtgagaga
|
| 3 |
+
Homosap TRAV1-1 TRAV1-1*02 X04939 (F) ggacaaagccttgagcagccctctgaagtgacagctgtggaaggagccattgtccagataaactgcacgtaccagacatctgggttttatgggctgtcctggtaccagcaacatgatggcggagcacccacatttctttcttacaatggtctggatggtttggaggagacaggtcgtttttcttcattccttagtcgctctgatagttatggttacctccttctacaggagctccagatgaaagactctgcctcttacttctgcgctgt
|
| 4 |
+
Homosap TRAV1-2 TRAV1-2*01 AE000658 F ggacaaaacattgaccagcccactgagatgacagctacggaaggtgccattgtccagatcaactgcacgtaccagacatctgggttcaacgggctgttctggtaccagcaacatgctggcgaagcacccacatttctgtcttacaatgttctggatggtttggaggagaaaggtcgtttttcttcattccttagtcggtctaaagggtacagttacctccttttgaaggagctccagatgaaagactctgcctcttacctctgtgctgtgagaga
|
| 5 |
+
Homosap TRAV1-2 TRAV1-2*02 U32544 [F] ggacaaaacattgaccagcccactgagatgacagctacggaaggtgccattgtccagatcaactgcacgtaccagacatctgggttcaacgggctgttctggtaccagcaacatgctggcgaagcacccacatttctgtcttacaatgttctggatggtctggaggagaaaggtcg
|
| 6 |
+
Homosap TRAV1-2 TRAV1-2*03 IMGT000024 F ggacaaaacattgaccagcccactgagatgacagctacggaaggtgccattgtccagatcaactgcacgtaccagacatctgggttcaacgggctgttctggtaccagcaacatgctggcgaagcacctacatttctgtcttacaatgttctggatggtttggaggagaaaggtcgtttttcttcattccttagtcggtctaaagggtacagttacctccttttgaaggagctccagatgaaagactctgcctcttacctctgtgctgtgagaga
|
| 7 |
+
Homosap TRAV10 TRAV10*01 AE000659 F aaaaaccaagtggagcagagtcctcagtccctgatcatcctggagggaaagaactgcactcttcaatgcaattatacagtgagccccttcagcaacttaaggtggtataagcaagatactgggagaggtcctgtttccctgacaatcatgactttcagtgagaacacaaagtcgaacggaagatatacagcaactctggatgcagacacaaagcaaagctctctgcacatcacagcctcccagctcagcgattcagcctcctacatctgtgtggtgagcg
|
| 8 |
+
Homosap TRAV10 TRAV10*02 IMGT000024 F aaaaaccaagtggagcagagtcctcagtccctgatcatcctggagggaaagaactgcactcttcaatgcaattatacagtgagccccttcagcaacttaaggtggtataagcaagatacggggagaggtcctgtttccctgacaatcatgactttcagtgagaacacaaagtcgaacggaagatatacagcaactctggatgcagacacaaagcaaagctctctgcacatcacagcctcccagctcagcgattcagcctcctacatctgtgtggtgagcg
|
| 9 |
+
Homosap TRAV12-1 TRAV12-1*01 AE000659 F cggaaggaggtggagcaggatcctggacccttcaatgttccagagggagccactgtcgctttcaactgtacttacagcaacagtgcttctcagtctttcttctggtacagacaggattgcaggaaagaacctaagttgctgatgtccgtatactccagtggtaatgaagatggaaggtttacagcacagctcaatagagccagccagtatatttccctgctcatcagagactccaagctcagtgattcagccacctacctctgtgtggtgaaca
|
| 10 |
+
Homosap TRAV12-1 TRAV12-1*02 M17657 (F) cggaaggaggtggagcaggatcctggacccttcaatgttccagagggagccactgtcgctttcaactgtacttacagcaacagtgcttctcagtctttcttctggtacagacaggattgcaggaaagaacctaagttgctgatgtccgtatactccagtggtaatgaagatggaaggtttacagcacacgtcaatagagccagccagtatatttccctgctcatcagagactccaagctcagtgattcagccacctacctctgtgtggtgaaca
|
| 11 |
+
Homosap TRAV12-2 TRAV12-2*01 AE000659 F cagaaggaggtggagcagaattctggacccctcagtgttccagagggagccattgcctctctcaactgcacttacagtgaccgaggttcccagtccttcttctggtacagacaatattctgggaaaagccctgagttgataatgttcatatactccaatggtgacaaagaagatggaaggtttacagcacagctcaataaagccagccagtatgtttctctgctcatcagagactcccagcccagtgattcagccacctacctctgtgccgtgaaca
|
| 12 |
+
Homosap TRAV12-2 TRAV12-2*02 M81774 (F) cagaaggaggtggagcagaattctggacccctcagtgttccagagggagccattgcctctctcaactgcacttacagtgaccgaggttcccagtccttcttctggtacagacaatattctgggaaaagccctgagttgataatgtccatatactccaatggtgacaaagaagatggaaggtttacagcacagctcaataaagccagccagtatgtttctctgctcatcagagactcccagcccagtgattcagccacctacctctgtgccgtg
|
| 13 |
+
Homosap TRAV12-2 TRAV12-2*03 X04946 (F) ggacccctcagtgttccagagggagccattgcctctctcaactgcacttacagtgaccgagtttcccagtccttcttctggtacagacaatattctgggaaaagccctgagttgataatgtccatatactccaatggtgacaaagaagatggaaggtttacagcacagctcaataaagccagccagtatgtttctctgctcatcagagactcccagcccagtgattcagccacctacctctgtgccgtgaac
|
| 14 |
+
Homosap TRAV12-3 TRAV12-3*01 AE000659 F cagaaggaggtggagcaggatcctggaccactcagtgttccagagggagccattgtttctctcaactgcacttacagcaacagtgcttttcaatacttcatgtggtacagacagtattccagaaaaggccctgagttgctgatgtacacatactccagtggtaacaaagaagatggaaggtttacagcacaggtcgataaatccagcaagtatatctccttgttcatcagagactcacagcccagtgattcagccacctacctctgtgcaatgagcg
|
| 15 |
+
Homosap TRAV12-3 TRAV12-3*02 M17656 (F) cagaaggaggtggagcaggatcctggaccactcagtgttccagagggagccattgtttctctcaactgcacttacagcaacagtgcttttcaatacttcatgtggtacagacagtattccagaataggccctgagttgctgatgtacacatactccagtggtaacaaagaagatggaaggtttacagcacaggtcgataaatccagcaagtatatctccttgttcatcagagactcacagcccagtgattcagccacctacctctgtgcaatgagcg
|
| 16 |
+
Homosap TRAV13-1 TRAV13-1*01 AE000659 F ggagagaatgtggagcagcatccttcaaccctgagtgtccaggagggagacagcgctgttatcaagtgtacttattcagacagtgcctcaaactacttcccttggtataagcaagaacttggaaaaggacctcagcttattatagacattcgttcaaatgtgggcgaaaagaaagaccaacgaattgctgttacattgaacaagacagccaaacatttctccctgcacatcacagagacccaacctgaagactcggctgtctacttctgtgcagcaagta
|
| 17 |
+
Homosap TRAV13-1 TRAV13-1*02 X04954 (F) ggagagaatgtggagcagcatccttcaaccctgagtgtccaggagggagacagcgctgttatcaagtgtacttattcagacagtgcctcaaactacttcccttggtataagcaagaacttggaaaaagacctcagcttattatagacattcgttcaaatgtgggcgaaaagaaagaccaacgaattgctgttacattgaacaagacagccaaacatttctccctgcacatcacagagacccaacctgaagactcggctgtctacttctgtgcagcaagta
|
| 18 |
+
Homosap TRAV13-1 TRAV13-1*03 L11162 [F] ggagagaatgtggagcagcatccttcaaccctgagtgtccaggagggagacagcgctgttatcaagtgtacttattcagacagtgcctcaaactacttcccttggtataagcaagaacttggaaaaagacctcagcttattatagacattcgttcaaatgtgggcgaaaagaaagaccaacgaattgctgttacattgaacaagacagccaaacatttctccctgcagatcaca
|
| 19 |
+
Homosap TRAV13-2 TRAV13-2*01 AE000659 F ggagagagtgtggggctgcatcttcctaccctgagtgtccaggagggtgacaactctattatcaactgtgcttattcaaacagcgcctcagactacttcatttggtacaagcaagaatctggaaaaggtcctcaattcattatagacattcgttcaaatatggacaaaaggcaaggccaaagagtcaccgttttattgaataagacagtgaaacatctctctctgcaaattgcagctactcaacctggagactcagctgtctacttttgtgcagagaata
|
| 20 |
+
Homosap TRAV13-2 TRAV13-2*02 M17658 (F) ggagagagtgtggggctgcatcttcctaccctgagtgtccaggagggtgacaactctattatcaactgtgcttattcaaacagcgcctcagactacttcatttggtacaaacaagaatctggaaaaggtcctcaattcattatagacattcgttcaaatatggacaaaaggcaaggccaaagagtcaccgttttattgaataagacagtgaaacatctctctctgcaaattgcagctactcaacctggagactcagctgtctacttttgtgcagaga
|
| 21 |
+
Homosap TRAV14/DV4 TRAV14/DV4*01 M21626 F gcccagaagataactcaaacccaaccaggaatgttcgtgcaggaaaaggaggctgtgactctggactgcacatatgacaccagtgatccaagttatggtctattctggtacaagcagcccagcagtggggaaatgatttttcttatttatcaggggtcttatgaccagcaaaatgcaacagaaggtcgctactcattgaatttccagaaggcaagaaaatccgccaaccttgtcatctccgcttcacaactgggggactcagcaatgtacttctgtgcaatgagagaggg
|
| 22 |
+
Homosap TRAV14/DV4 TRAV14/DV4*02 AE000659 F gcccagaagataactcaaacccaaccaggaatgttcgtgcaggaaaaggaggctgtgactctggactgcacatatgacaccagtgatcaaagttatggtctattctggtacaagcagcccagcagtggggaaatgatttttcttatttatcaggggtcttatgacgagcaaaatgcaacagaaggtcgctactcattgaatttccagaaggcaagaaaatccgccaaccttgtcatctccgcttcacaactgggggactcagcaatgtatttctgtgcaatgagagaggg
|
| 23 |
+
Homosap TRAV14/DV4 TRAV14/DV4*03 M21624 (F) gcccagaagataactcaaacccaaccaggaatgttcgtgcaggaaaaggaggctgtgactctggactgcacatatgacaccagtgatccaagttatggtctattctggtacaagcagcccagcagtggggaaatgatttttcttatttatcaggggtcttatgaccagcaaaatgcaacagaaggtcgctactcattgaatttccagaaggcaagaaaatccgccaaccttgtcatctccgcttcacaactgggggactcagcaatgtatttctgtgcaatg
|
| 24 |
+
Homosap TRAV14/DV4 TRAV14/DV4*04 L09758 [F] cagaagataactcaaacccaaccaggaatgttcgtgcaggaaaaggaggctgtgactctggactgcacatatgacaccagtgatcaaagttatggtctcttctggtacaagcagcccagcagtggggaaatgatttttcttatttatcaggggtcttatgacgagcaaaatgcaacagaaggtcgctactcattgaatttccagaaggcaagaaaatccgccaaccttgtcatctccgcttcacaactgggggactcagcaatgtacttct
|
| 25 |
+
Homosap TRAV16 TRAV16*01 AE000659 F gcccagagagtgactcagcccgagaagctcctctctgtctttaaaggggccccagtggagctgaagtgcaactattcctattctgggagtcctgaactcttctggtatgtccagtactccagacaacgcctccagttactcttgagacacatctctagagagagcatcaaaggcttcactgctgaccttaacaaaggcgagacatctttccacctgaagaaaccatttgctcaagaggaagactcagccatgtattactgtgctctaagtgg
|
| 26 |
+
Homosap TRAV17 TRAV17*01 AE000660 F agtcaacagggagaagaggatcctcaggccttgagcatccaggagggtgaaaatgccaccatgaactgcagttacaaaactagtataaacaatttacagtggtatagacaaaattcaggtagaggccttgtccacctaattttaatacgttcaaatgaaagagagaaacacagtggaagattaagagtcacgcttgacacttccaagaaaagcagttccttgttgatcacggcttcccgggcagcagacactgcttcttacttctgtgctacggacg
|
| 27 |
+
Homosap TRAV18 TRAV18*01 AE000660 F ggagactcggttacccagacagaaggcccagttaccctccctgagagggcagctctgacattaaactgcacttatcagtccagctattcaacttttctattctggtatgtccagtatctaaacaaagagcctgagctcctcctgaaaagttcagaaaaccaggagacggacagcagaggttttcaggccagtcctatcaagagtgacagttccttccacctggagaagccctcggtgcagctgtcggactctgccgtgtactactgcgctctgagaga
|
| 28 |
+
Homosap TRAV19 TRAV19*01 AE000660 F gctcagaaggtaactcaagcgcagactgaaatttctgtggtggagaaggaggatgtgaccttggactgtgtgtatgaaacccgtgatactacttattacttattctggtacaagcaaccaccaagtggagaattggttttccttattcgtcggaactcttttgatgagcaaaatgaaataagtggtcggtattcttggaacttccagaaatccaccagttccttcaacttcaccatcacagcctcacaagtcgtggactcagcagtatacttctgtgctctgagtgaggc
|
| 29 |
+
Homosap TRAV2 TRAV2*01 AE000658 F aaggaccaagtgtttcagccttccacagtggcatcttcagagggagctgtggtggaaatcttctgtaatcactctgtgtccaatgcttacaacttcttctggtaccttcacttcccgggatgtgcaccaagactccttgttaaaggctcaaagccttctcagcagggacgatacaacatgacctatgaacggttctcttcatcgctgctcatcctccaggtgcgggaggcagatgctgctgtttactactgtgctgtggagga
|
| 30 |
+
Homosap TRAV2 TRAV2*02 M17659 (F) aaggaccaagtgtttcagccttccacagtggcatcttcagagggagctgtggtggaaatcttctgtaatcactctgtgtccaatgcttacaacttcttctggcaccttcacttcccgggatgtgcaccaagactccttgttaaaggctcaaagccttctcagcagggacgatacaacatgacctatgaacggttctcttcatcgctgctcatcctccaggtgcgggaggcagatgctgctgtttactactgtgctgtggcctgg
|
| 31 |
+
Homosap TRAV20 TRAV20*01 AE000660 F gaagaccaggtgacgcagagtcccgaggccctgagactccaggagggagagagtagcagtcttaactgcagttacacagtcagcggtttaagagggctgttctggtataggcaagatcctgggaaaggccctgaattcctcttcaccctgtattcagctggggaagaaaaggagaaagaaaggctaaaagccacattaacaaagaaggaaagctttctgcacatcacagcccctaaacctgaagactcagccacttatctctgtgctgtgcagg
|
| 32 |
+
Homosap TRAV20 TRAV20*02 IMGT000024 F gaagaccaggtgacgcagagtcccgaggccctgagactccaggagggagagagtagcagtctcaactgcagttacacagtcagcggtttaagagggctgttctggtataggcaagatcctgggaaaggccctgaattcctcttcaccctgtattcagctggggaagaaaaggagaaagaaaggctaaaagccacattaacaaagaaggaaagctttctgcacatcacagcccctaaacctgaagactcagccacttatctctgtgctgtgcagg
|
| 33 |
+
Homosap TRAV20 TRAV20*03 S60789 (F) gaagaccaggtgacgcagagtcccgaggccctgagactccaggagggagagagtcgcagtctcaactgcagttacacagtcagcggtttaagagggctgttctggtataggcaagatcctgggaaaggccctgaattcctcttcaccctgtattcagctggggaagaaaaggagaaagaaaggctaaaagccacattaacaaagaaggaaagctttctgcacatcacagcccctaaacctgaagactcagccacttatctctgt
|
| 34 |
+
Homosap TRAV20 TRAV20*04 X70305 (F) gaagaccaggtgacgcagagtcccgaggccctgagactccaggagggagagagtagcagtctcaactgcagttgcacagtcagcggtttaagagggctgttctggtataggcaagatcctgggaaaggccctgaattcctcttcaccctgtattcagctggggaagaaaaggagaaagaaaggctaaaagccacattaacaaagaaggaaagctttctgcacatcacagcccctaaacctgaagactcagccacttatctctgtgct
|
| 35 |
+
Homosap TRAV21 TRAV21*01 AE000660 F aaacaggaggtgacgcagattcctgcagctctgagtgtcccagaaggagaaaacttggttctcaactgcagtttcactgatagcgctatttacaacctccagtggtttaggcaggaccctgggaaaggtctcacatctctgttgcttattcagtcaagtcagagagagcaaacaagtggaagacttaatgcctcgctggataaatcatcaggacgtagtactttatacattgcagcttctcagcctggtgactcagccacctacctctgtgctgtgagg
|
| 36 |
+
Homosap TRAV21 TRAV21*02 X58736 (F) aaacaggaggtgacacagattcctgcagctctgagtgtcccagaaggagaaaacttggttctcaactgcagtttcactgatagcgctatttacaacctccagtggtttaggcaggaccctgggaaaggtctcacatctctgttgcttattcagtcaagtcagagagagcaaacaagtggaagacttaatgcctcgctggataaatcatcaggacgtagtactttatacattgcagcttctcagcctggtgactcagccacctacctctgtgct
|
| 37 |
+
Homosap TRAV22 TRAV22*01 AE000660 F ggaatacaagtggagcagagtcctccagacctgattctccaggagggagccaattccacgctgcggtgcaatttttctgactctgtgaacaatttgcagtggtttcatcaaaacccttggggacagctcatcaacctgttttacattccctcagggacaaaacagaatggaagattaagcgccacgactgtcgctacggaacgctacagcttattgtacatttcctcttcccagaccacagactcaggcgtttatttctgtgctgtggagc
|
| 38 |
+
Homosap TRAV23/DV6 TRAV23/DV6*01 AE000660 F cagcagcaggtgaaacaaagtcctcaatctttgatagtccagaaaggagggatttcaattataaactgtgcttatgagaacactgcgtttgactactttccatggtaccaacaattccctgggaaaggccctgcattattgatagccatacgtccagatgtgagtgaaaagaaagaaggaagattcacaatctccttcaataaaagtgccaagcagttctcattgcatatcatggattcccagcctggagactcagccacctacttctgtgcagcaagca
|
| 39 |
+
Homosap TRAV23/DV6 TRAV23/DV6*02 M17660 (F) cagcagcaggtgaaacaaagtcctcaatctttgatagtccagaaaggagggattccaattataaactgtgcttatgagaacactgcgtttgactactttccatggtaccaacaattccctgggaaaggccctgcattattgatagccatacgtccagatgtgagtgaaaagaaagaaggaagattcacaatctccttcaataaaagtgccaagcagttctcattgcatatcatggattcccagcctggagactcagccacctacttctgtgcagcaagcg
|
| 40 |
+
Homosap TRAV23/DV6 TRAV23/DV6*03 M97704 (F) cagcagcaggtgaaacaaagtcctcaatctttgatagtccagaaaggagggatttcaattataaactgtgcttatgagaacactgcgtttgactactttccatggtaccaacagttccctgggaaaggccctgcattattgatagccatacgtccagatgtgagtgaaaagaaagaaggaagattcacaatctccttcaataaaagtgccaagcagttctcattgcatatcatggattcccagcctggagactcagccacctacttctgtgcagcaagca
|
| 41 |
+
Homosap TRAV23/DV6 TRAV23/DV6*04 Y10411 [F] cagcaggtgaaacaaagtcctcaatctttgatagtccagaaaggagggatttcaattataaactgtgcttatgagaacactgcgtttgactactttccatggtaccagcaattccctgggaaaggccctgcattattgatagccatacgtccagatgtgagtgaaaagaaagaaggaagattcacaatctccttcaataaaagtgccaagcagttctcattgcatatcatggattcccagcctggagactcagccacctacttctgt
|
| 42 |
+
Homosap TRAV23/DV6 TRAV23/DV6*05 IMGT000024 F cagcagcaggtgaaacaaagtcctcaatctttgatagtccagaaaggagggatttcaattataaactgtgcttatgagaacactgcgtttgactactttccatggtaccaacaattccctgggaaaggccctgcattattgatagccatacgtccagatgtgagtgaaaagaaagaaggaagattcacaatctccttcaataaaagtgccaagcagttctcatcgcatatcatggattcccagcctggagactcagccacctacttctgtgcagcaagca
|
| 43 |
+
Homosap TRAV24 TRAV24*01 AE000660 F atactgaacgtggaacaaagtcctcagtcactgcatgttcaggagggagacagcaccaatttcacctgcagcttcccttccagcaatttttatgccttacactggtacagatgggaaactgcaaaaagccccgaggccttgtttgtaatgactttaaatggggatgaaaagaagaaaggacgaataagtgccactcttaataccaaggagggttacagctatttgtacatcaaaggatcccagcctgaagactcagccacatacctctgtgccttta
|
| 44 |
+
Homosap TRAV24 TRAV24*02 M17661 (F) atactgaacgtggaacaaggtcctcagtcactgcatgttcaggagggagacagcaccaatttcacctgcagcttcccttccagcaatttttatgccttacactggtacagatgggaaactgccaaaacacccgaggccttgtttgtaatgactttaaatggggatgaaaagaagaaaggacgaataagtgccactcttaataccaaggagggttacagctatttgtacatcaaaggatcccagcctgaagattcagccacatacctctgtgccttta
|
| 45 |
+
Homosap TRAV25 TRAV25*01 AE000660 F ggacaacaggtaatgcaaattcctcagtaccagcatgtacaagaaggagaggacttcaccacgtactgcaattcctcaactactttaagcaatatacagtggtataagcaaaggcctggtggacatcccgtttttttgatacagttagtgaagagtggagaagtgaagaagcagaaaagactgacatttcagtttggagaagcaaaaaagaacagctccctgcacatcacagccacccagactacagatgtaggaacctacttctgtgcaggg
|
| 46 |
+
Homosap TRAV26-1 TRAV26-1*01 AE000660 F gatgctaagaccacccagcccccctccatggattgcgctgaaggaagagctgcaaacctgccttgtaatcactctaccatcagtggaaatgagtatgtgtattggtatcgacagattcactcccaggggccacagtatatcattcatggtctaaaaaacaatgaaaccaatgaaatggcctctctgatcatcacagaagacagaaagtccagcaccttgatcctgccccacgctacgctgagagacactgctgtgtactattgcatcgtcagagtcg
|
| 47 |
+
Homosap TRAV26-1 TRAV26-1*02 IMGT000024 F gatgctaagaccacccagcccacctccatggattgcgctgaaggaagagctgcaaacctgccttgtaatcactctaccatcagtggaaatgagtatgtgtattggtatcgacagattcactcccaggggccacagtatatcattcatggtctaaaaaacaatgaaaccaatgaaatggcctctctgatcatcacagaagacagaaagtccagcaccttgatcctgccccacgctacgctgagagacactgctgtgtactattgcatcgtcagagtcg
|
| 48 |
+
Homosap TRAV26-1 TRAV26-1*03 L06886 (F) gatgctaagaccacccagcccccctccatggattgcgctgaaggaagagctgcaaacctgccttgtaatcactctaccatcagtggaaatgagtatgtgtattggtatcgacagattcactcccaggggccacagaatatcattcatggtctaaaaaacaatgaaaccaatgaaatggcctctctgatcatcacagaagacagaaagtccagcaccttgatcctgccccacgctacgctgagagacactgctgtgtactattgcatc
|
| 49 |
+
Homosap TRAV26-2 TRAV26-2*01 AE000660 F gatgctaagaccacacagccaaattcaatggagagtaacgaagaagagcctgttcacttgccttgtaaccactccacaatcagtggaactgattacatacattggtatcgacagcttccctcccagggtccagagtacgtgattcatggtcttacaagcaatgtgaacaacagaatggcctctctggcaatcgctgaagacagaaagtccagtaccttgatcctgcaccgtgctaccttgagagatgctgctgtgtactactgcatcctgagagac
|
| 50 |
+
Homosap TRAV26-2 TRAV26-2*02 L11160 [F] gatgctaagaccacacagccaaattcaatggagagtaacgaagaagagcctgttcacttgccttgtaaccactccacaatcagtggaactgattacatacattggtatcgacagcttccctcccagggtccagagtacgtgattcatggtcttacaagcaatgtgaacaacagaatggcctgtgtggcaatcgctgaagacagaaagtccagtacct
|
| 51 |
+
Homosap TRAV27 TRAV27*01 AE000660 F acccagctgctggagcagagccctcagtttctaagcatccaagagggagaaaatctcactgtgtactgcaactcctcaagtgttttttccagcttacaatggtacagacaggagcctggggaaggtcctgtcctcctggtgacagtagttacgggtggagaagtgaagaagctgaagagactaacctttcagtttggtgatgcaagaaaggacagttctctccacatcactgcagcccagcctggtgatacaggcctctacctctgtgcaggag
|
| 52 |
+
Homosap TRAV27 TRAV27*02 X04957 (F) acccagctgctggagcagagccctcagtttctaagcatccaagagggagaaaatctcactgtgtactgcaactcctcaagtgttttttccagcttacaatggtacaggcaggagcctggggaaggtcctgtcctcctggtgacagtagttacgggtggagaagtgaagaagctgaagagactaacctttcagtttggtgatgcaagaaaggacagttctctccacatcactgcggcccagcctggtgatacaggccactacctctgtgcagg
|
| 53 |
+
Homosap TRAV27 TRAV27*03 IMGT000024 F acccagctgctggagcagagccctcagtttctaagcatccaagagggagaaaatctcactgtgtactgcaactcctcaagtgttttttccagcttacaatggtacagacaggagcctggggaaggtcctgtcctcctggtgacagtagttacgggtggagaagtgaagaagctgaagagactaacctttcagtttggtgatgcaagaaaggacagttctctccacatcactgcagcccagactggtgatacaggcctctacctctgtgcaggag
|
| 54 |
+
Homosap TRAV29/DV5 TRAV29/DV5*01 AE000660 F gaccagcaagttaagcaaaattcaccatccctgagcgtccaggaaggaagaatttctattctgaactgtgactatactaacagcatgtttgattatttcctatggtacaaaaaataccctgctgaaggtcctacattcctgatatctataagttccattaaggataaaaatgaagatggaagattcactgtcttcttaaacaaaagtgccaagcacctctctctgcacattgtgccctcccagcctggagactctgcagtgtacttctgtgcagcaagcg
|
| 55 |
+
Homosap TRAV29/DV5 TRAV29/DV5*02 S81645 F gaccagcaagttaagcaaaattcaccatccctgagcgtccaggaaggaagaatttctattctgaactgtgactatactaacagcatgtttgattatttcctatggtacaaaaaataccctgctgaaggtcctacattcctgatatctataagttccattaaggataaaaatgaagatggaagattcactgttttcttaaacaaaagtgccaagcacctctctctcgacattgtgccctcccagcctggagactctgcagtgtacttctgtgcagcaagc
|
| 56 |
+
Homosap TRAV29/DV5 TRAV29/DV5*04 IMGT000024 F gaccagcaagttaagcaaaattcaccatccctgagcgtccaggaaggaagaatttctattctgaactgtgactatactaacagcatgtttgattatttcctatggtacaaaaaataccctgctgaaggtcctacattcctgatatctataagttccattaaggataaaaatgaagatggaagattcactgttttcttaaacaaaagtgccaagcacctctctctgcacattgtgccctcccagcctggagactctgcagtgtacttctgtgcagcaagcg
|
| 57 |
+
Homosap TRAV3 TRAV3*01 AE000658 F gctcagtcagtggctcagccggaagatcaggtcaacgttgctgaagggaatcctctgactgtgaaatgcacctattcagtctctggaaacccttatcttttttggtatgttcaataccccaaccgaggcctccagttccttctgaaatacatcacaggggataacctggttaaaggcagctatggctttgaagctgaatttaacaagagccaaacctccttccacctgaagaaaccatctgcccttgtgagcgactccgctttgtacttctgtgctgtgagagaca
|
| 58 |
+
Homosap TRAV30 TRAV30*01 AE000660 F caacaaccagtgcagagtcctcaagccgtgatcctccgagaaggggaagatgctgtcatcaactgcagttcctccaaggctttatattctgtacactggtacaggcagaagcatggtgaagcacccgtcttcctgatgatattactgaagggtggagaacagaagggtcatgaaaaaatatctgcttcatttaatgaaaaaaagcagcaaagctccctgtaccttacggcctcccagctcagttactcaggaacctacttctgcggcacagaga
|
| 59 |
+
Homosap TRAV30 TRAV30*02 X58768 (F) caacaaccagtgcagagtcctcaagccgtgatcctccgagaaggggaagatgctgtcaccaactgcagttcctccaaggctttatattctgtacactggtacaggcagaagcatggtgaagcacccgtcttcctgatgatattactgaagggtggagaacagatgcgtcgtgaaaaaatatctgcttcatttaatgaaaaaaagcagcaaagctccctgtaccttacggcctcccagctcagttactcaggaacctacttctgcggg
|
| 60 |
+
Homosap TRAV30 TRAV30*03 L06883 (F) caacaaccagtgcagagtcctcaagccgtgatcctccgagaaggggaagatgctgtcatcaactgcagttcctccaaggctttatattctgtacactggtacaggcagaagcatggtgaagcacccgtcttcctgatgatattactgaagggtggagaacagaagggtcatgaaaaaatatctgcttcatttaatgaaaaaaagcggcaaagctccctgtaccttacggcctcccagctcagttactcaggaacctacttctgcggc
|
| 61 |
+
Homosap TRAV30 TRAV30*04 U32537 [F] caacaaccagtgcagagtcctcaagccgtgatcctccgagaaggggaagatgctgtcatcaactgcagttcctccaaggctttatattctgtacactggtacaggcagaagcatggtgaagcacccgtcttcctgatgatattactgaagggtggagaacagaagcgtcatgaaaaaatatctgcttcatttaatgaaaaaaagcagcaaagctccctgtaccttacggc
|
| 62 |
+
Homosap TRAV30 TRAV30*05 IMGT000024 F caacaaccagtgcagagtcctcaagccgtgatcctccgagaaggggaagatgctgtcatcaactgcagttcctccaaggctttatattctgtacactggtacaggcagaagcatggtgaagcacccgtcttcctgatgatattactgaagggtggagaacagaagggtcatgacaaaatatctgcttcatttaatgaaaaaaagcagcaaagctccctgtaccttacggcctcccagctcagttactcaggaacctacttctgcggcacagaga
|
| 63 |
+
Homosap TRAV34 TRAV34*01 AE000660 F agccaagaactggagcagagtcctcagtccttgatcgtccaagagggaaagaatctcaccataaactgcacgtcatcaaagacgttatatggcttatactggtataagcaaaagtatggtgaaggtcttatcttcttgatgatgctacagaaaggtggggaagagaaaagtcatgaaaagataactgccaagttggatgagaaaaagcagcaaagttccctgcatatcacagcctcccagcccagccatgcaggcatctacctctgtggagcagaca
|
| 64 |
+
Homosap TRAV35 TRAV35*01 AE000660 F ggtcaacagctgaatcagagtcctcaatctatgtttatccaggaaggagaagatgtctccatgaactgcacttcttcaagcatatttaacacctggctatggtacaagcaggaacctggggaaggtcctgtcctcttgatagccttatataaggctggtgaattgacctcaaatggaagactgactgctcagtttggtataaccagaaaggacagcttcctgaatatctcagcatccatacctagtgatgtaggcatctacttctgtgctgggcag
|
| 65 |
+
Homosap TRAV35 TRAV35*02 X58738 (F) ggtcaacagctgaatcagagtcctcaatctatgtttatccaggaaggagaagatgtctccatgaactgcacttcttcaagcatatttaacacctggctatggtacaagcaggaccctggggaaggtcctgtcctcttgatagccttatataaggctggtgaattgacctcaaatggaagactgactgctcagtttggtataaccagaaaggacagcttcctgaatatctcagcatccatacctagtgatgtaggcatctacttctgtgct
|
| 66 |
+
Homosap TRAV36/DV7 TRAV36/DV7*01 AE000660 F gaagacaaggtggtacaaagccctctatctctggttgtccacgagggagacaccgtaactctcaattgcagttatgaagtgactaactttcgaagcctactatggtacaagcaggaaaagaaagctcccacatttctatttatgctaacttcaagtggaattgaaaagaagtcaggaagactaagtagcatattagataagaaagaactttccagcatcctgaacatcacagccacccagaccggagactcggccatctacctctgtgctgtggagg
|
| 67 |
+
Homosap TRAV36/DV7 TRAV36/DV7*02 X61070 (F) gaagacaaggtggtacaaagccctcaatctctggttgtccacgagggagacactgtaactctcaattgcagttatgaaatgactaactttcgaagcctacaatggtacaagcaggaaaagaaagctcccacatttctatttatgctaacttcaagtggaattgaaaagaagtcaggaagactaagtagcatattagataagaaagaacttttcagcatcctgaacatcacagccacccagaccggagactcggccgtctacctctgtgctgtgg
|
| 68 |
+
Homosap TRAV36/DV7 TRAV36/DV7*03 X58767 (F) gaagacaaggtggtacaaagccctctatctctggttgtccacgagggagacactgtaactcccaattgcagttatgaagtgactaactttcgaagcctactatggtacaagcaggaaaagaaagctcccacatttctatttatgctaacttcaagtggaattgaaaagaagtcaggaagactaagtagcatattagataagaaagaacttttcagcatcctgaacatcacagccacccagaccggagactcggccgtctacctctgtgct
|
| 69 |
+
Homosap TRAV36/DV7 TRAV36/DV7*04 Z46643 (F) gaagacaaggtggtacaaagccctctatctctggttgtccacgagggagacactgtaactctcaattgcagttatgaagtgactaactttcgaagcctactatggtacaagcaggaaaagaaagctcccacatttctatttatgctaacttcaagtggaattgaaaagaagtcaggaagactaagtagcatattagataagaaagaacttttcagcatcctgaacatcacagccacccagaccggagactcggccgtctacctctgtgctg
|
| 70 |
+
Homosap TRAV36/DV7 TRAV36/DV7*05 IMGT000024 F gaagacaaggtggtacaaagccctctatctctggttgtccacgagggagacaccgtaactctcaattgcagttatgaagtgactaactttcgaagcctactatggtacaagcaggaaaagaaagctcccacatttctatttatgctaacttcaagtggaattgaaaagaagtcaggaagactaagtagcatattagataagaaagaacttttcagcatcctgaacatcacagccacccagaccggagactcggccatctacctctgtgctgtggagg
|
| 71 |
+
Homosap TRAV38-1 TRAV38-1*01 AE000661 F gcccagacagtcactcagtctcaaccagagatgtctgtgcaggaggcagagactgtgaccctgagttgcacatatgacaccagtgagaataattattatttgttctggtacaagcagcctcccagcaggcagatgattctcgttattcgccaagaagcttataagcaacagaatgcaacggagaatcgtttctctgtgaacttccagaaagcagccaaatccttcagtctcaagatctcagactcacagctgggggacactgcgatgtatttctgtgctttcatgaagca
|
| 72 |
+
Homosap TRAV38-1 TRAV38-1*02 M64355 (F) gcccagacagtcactcagtctcaaccagagatgtctgtgcaggaggcagagactgtgaccctgagttgcacatatgacaccagtgagaatgattattatttgttctggtacaagcagcctcccagcaggcagatgattctcgttattcgccaagaagcttataagcaacagaatgcaacggagaatcgtttctctgtgaacttccagaaagcagccaaatccttcagtctcaagatctcagactcacagctgggggacactgcgatgtatttctgtgctt
|
| 73 |
+
Homosap TRAV38-1 TRAV38-1*03 M95394 (F) gcccagacagtcactcagtctcaaccagagatgtctgtgcaggaggcagagactgtgaccctgagttgcacatatgacaccagtgagagtaattattatttgttctggtacaaacagcctcccagcaggcagatgattctcgttattcgccaagaagcttataagcaacagaatgcaacggagaatcgtttctctgtgaacttccagaaagcagccaaatccttcagtctcaagatctcagactcacagctgggggacactgcgatgtatttctgtgctttca
|
| 74 |
+
Homosap TRAV38-1 TRAV38-1*04 L06880 (F) gcccagacagtcactcagtcccagccagagatgtctgtgcaggaggcagagactgtgaccctgagttgcacatatgacaccagtgagaataattattatttgttctggtacaagcagcctcccagcaggcagatgattctcgttattcgccaagaagcttataagcaacagaatgcaacggagaatcgtttctctgtgaacttccagaaagcagccaaatccttcagtctcaagatctcagactcacagctgggggacactgcgatgtatttctgtgca
|
| 75 |
+
Homosap TRAV38-2/DV8 TRAV38-2/DV8*01 AE000661 F gctcagacagtcactcagtctcaaccagagatgtctgtgcaggaggcagagaccgtgaccctgagctgcacatatgacaccagtgagagtgattattatttattctggtacaagcagcctcccagcaggcagatgattctcgttattcgccaagaagcttataagcaacagaatgcaacagagaatcgtttctctgtgaacttccagaaagcagccaaatccttcagtctcaagatctcagactcacagctgggggatgccgcgatgtatttctgtgcttataggagcg
|
| 76 |
+
Homosap TRAV39 TRAV39*01 AE000661 F gagctgaaagtggaacaaaaccctctgttcctgagcatgcaggagggaaaaaactataccatctactgcaattattcaaccacttcagacagactgtattggtacaggcaggatcctgggaaaagtctggaatctctgtttgtgttgctatcaaatggagcagtgaagcaggagggacgattaatggcctcacttgataccaaagcccgtctcagcaccctccacatcacagctgccgtgcatgacctctctgccacctacttctgtgccgtggaca
|
| 77 |
+
Homosap TRAV4 TRAV4*01 AE000658 F cttgctaagaccacccagcccatctccatggactcatatgaaggacaagaagtgaacataacctgtagccacaacaacattgctacaaatgattatatcacgtggtaccaacagtttcccagccaaggaccacgatttattattcaaggatacaagacaaaagttacaaacgaagtggcctccctgtttatccctgccgacagaaagtccagcactctgagcctgccccgggtttccctgagcgacactgctgtgtactactgcctcgtgggtgaca
|
| 78 |
+
Homosap TRAV40 TRAV40*01 X73521 F agcaattcagtcaagcagacgggccaaataaccgtctcggagggagcatctgtgactatgaactgcacatacacatccacggggtaccctacccttttctggtatgtggaataccccagcaaacctctgcagcttcttcagagagagacaatggaaaacagcaaaaacttcggaggcggaaatattaaagacaaaaactcccccattgtgaaatattcagtccaggtatcagactcagccgtgtactactgtcttctgggaga
|
| 79 |
+
Homosap TRAV41 TRAV41*01 AE000661 F aaaaatgaagtggagcagagtcctcagaacctgactgcccaggaaggagaatttatcacaatcaactgcagttactcggtaggaataagtgccttacactggctgcaacagcatccaggaggaggcattgtttccttgtttatgctgagctcagggaagaagaagcatggaagattaattgccacaataaacatacaggaaaagcacagctccctgcacatcacagcctcccatcccagagactctgccgtctacatctgtgctgtcaga
|
| 80 |
+
Homosap TRAV5 TRAV5*01 AE000659 F ggagaggatgtggagcagagtcttttcctgagtgtccgagagggagacagctccgttataaactgcacttacacagacagctcctccacctacttatactggtataagcaagaacctggagcaggtctccagttgctgacgtatattttttcaaatatggacatgaaacaagaccaaagactcactgttctattgaataaaaaggataaacatctgtctctgcgcattgcagacacccagactggggactcagctatctacttctgtgcagagagta
|
| 81 |
+
Homosap TRAV6 TRAV6*01 AE000659 F agccaaaagatagaacagaattccgaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattccccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatatcacagcctcccagcctgcagactcagctacctacctctgtgctctagaca
|
| 82 |
+
Homosap TRAV6 TRAV6*02 X58747 (F) agccaaaagatagaacagaattccgaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatatcacagcctcccagcctgcagactcagctacctacctctgtgct
|
| 83 |
+
Homosap TRAV6 TRAV6*03 Z49060 [F] gaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctacttatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatatcacagcctcccagcctgcagactcagctacctacctctgtgct
|
| 84 |
+
Homosap TRAV6 TRAV6*04 Y10409 [F] gaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatgtcacagcctcccagcctgcagactcagctacctacctctgtgct
|
| 85 |
+
Homosap TRAV6 TRAV6*05 Y10410 [F] gaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacgaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatatcacagcctcccagcctgcagactcagctacctacctctgtgct
|
| 86 |
+
Homosap TRAV6 TRAV6*06 U32542 [F] agccaaaagatagaacagaattccgaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaccaga
|
| 87 |
+
Homosap TRAV6 TRAV6*07 IMGT000024 F agccaaaagatagaacagaattccgaggctctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatatcacagcctcccagcctgcagactcagctacctacctctgtgctctagaca
|
| 88 |
+
Homosap TRAV7 TRAV7*01 AE000659 F gaaaaccaggtggagcacagccctcattttctgggaccccagcagggagacgttgcctccatgagctgcacgtactctgtcagtcgttttaacaatttgcagtggtacaggcaaaatacagggatgggtcccaaacacctattatccatgtattcagctggatatgagaagcagaaaggaagactaaatgctacattactgaagaatggaagcagcttgtacattacagccgtgcagcctgaagattcagccacctatttctgtgctgtagatg
|
| 89 |
+
Homosap TRAV8-1 TRAV8-1*01 AE000659 F gcccagtctgtgagccagcataaccaccacgtaattctctctgaagcagcctcactggagttgggatgcaactattcctatggtggaactgttaatctcttctggtatgtccagtaccctggtcaacaccttcagcttctcctcaagtacttttcaggggatccactggttaaaggcatcaagggctttgaggctgaatttataaagagtaaattctcctttaatctgaggaaaccctctgtgcagtggagtgacacagctgagtacttctgtgccgtgaatgc
|
| 90 |
+
Homosap TRAV8-1 TRAV8-1*02 U32520 [F] gcccagtctgtgagccagcataaccaccacgtaattctctctgaagcagcctcactggagttgggatgcaactattcctatggtggaactgttaatctcttctggtatgtccagtaccctggtcaacaccttcagcttctcctcaagtacttttcaggggatccactggttaaaggcatcaagggcgttgaggctgaatttataaagagtaaattctcctttaatctgaggaaaccctctgtgcagtgga
|
| 91 |
+
Homosap TRAV8-2 TRAV8-2*01 AE000659 F gcccagtcggtgacccagcttgacagccacgtctctgtctctgaaggaaccccggtgctgctgaggtgcaactactcatcttcttattcaccatctctcttctggtatgtgcaacaccccaacaaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgttgtgagtga
|
| 92 |
+
Homosap TRAV8-2 TRAV8-2*02 M17650 (F) gcccagtcggtgacccagcttagcagccacgtctctgtctctgaaggaaccccggtgctgctgaggtgcaactactcatcttcttattcaccatctctcttctggtatgtgcaacaccccaacaaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgttgtga
|
| 93 |
+
Homosap TRAV8-2 TRAV8-2*03 IMGT000024 F gcccagtcggtgacccagcttgacagccacgtctctgtctctgaaggaaccccggtgctgctgaggtgcaactactcatcttcttattcaccgtctctcttctggtatgtgcaacaccccaacaaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgttgtgagtga
|
| 94 |
+
Homosap TRAV8-3 TRAV8-3*01 AE000659 F gcccagtcagtgacccagcctgacatccacatcactgtctctgaaggagcctcactggagttgagatgtaactattcctatggggcaacaccttatctcttctggtatgtccagtcccccggccaaggcctccagctgctcctgaagtacttttcaggagacactctggttcaaggcattaaaggctttgaggctgaatttaagaggagtcaatcttccttcaatctgaggaaaccctctgtgcattggagtgatgctgctgagtacttctgtgctgtgggtgc
|
| 95 |
+
Homosap TRAV8-3 TRAV8-3*02 M35617 (F) gcccagtcagtgacccagcctgacatccacatcactgtctctgaaggagcctcactggagttgagatgtaactattcctatggggcaacaccttatctcttctggtatgtccagtcccccggccaaggcctccagctgctcctgaagtacttttcaggagacactctggttcaaggcattaaaggctttgaggctgaatttaagaggagtcaatcttccttcaacctgaggaaaccctctgtgcattggagtgatgctgctgagtacttctgtgctgtggtt
|
| 96 |
+
Homosap TRAV8-3 TRAV8-3*03 L06885 (F) gcccagtcagtgacccagcctgacatccacatcactgtctctgaaggagcctcactggagttgagatgtaactattcctatggggcaacaccttatctcttctggtatgtccagtcccccggccaaggcctccagctgctcctgaagtacttttcaggagacactctggttcaaggtattaaaggctttgaggctgaatttaagaggagtcaatcttccttcaatctgaggaaaccctctgtgcattggagtgatgcgtctgagtacttctgtgct
|
| 97 |
+
Homosap TRAV8-4 TRAV8-4*01 AE000659 F gcccagtcggtgacccagcttggcagccacgtctctgtctctgaaggagccctggttctgctgaggtgcaactactcatcgtctgttccaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgctgtgagtga
|
| 98 |
+
Homosap TRAV8-4 TRAV8-4*02 M12423 (F) gcccagtcggtgacccagcttggcagccacgtctctgtctctgaaggagccctggttctgctgaggtgcaactactcatcgtctgttccaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacaaaaccctcagcccatatgagcgacgcggctgagtacttctgtgctgtgagtga
|
| 99 |
+
Homosap TRAV8-4 TRAV8-4*03 D13077 (F) gcccagtcggtgacccagcttggcagccacgtctctgtctctgagggagccctggttctgctgaggtgcaactactcatcgtctgttccaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacaacaggggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgct
|
| 100 |
+
Homosap TRAV8-4 TRAV8-4*04 M12959 (F) gcccagtcggtgacccagcttggcagccacgtctctgtctctgaacgagccctggttctgctgaggtgcaactactcatcgtctgttccaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgctgtgagtga
|
| 101 |
+
Homosap TRAV8-4 TRAV8-4*05 X63455 (F) gcccagtcggtgacccagcttggcagccacgtctctgtctctgaaggagccctggttctgctgaggtgcaactactcatcgtctgttccaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggaatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgctgtgagtga
|
| 102 |
+
Homosap TRAV8-4 TRAV8-4*06 K02777 (F) ggtgcaactcactactgctgtccaccaatactcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaacccgcagcccatatgagcgacgcggctgagtacttctgtgctgtgagtga
|
| 103 |
+
Homosap TRAV8-4 TRAV8-4*07 M17665 (F) gttgaaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacaacaggggccaccctggttaaaggcatcaacggttttgaggctgaatttaaaaagagtgaaacctccttccacctgacgaaaccctcagcccatatgaccgacccggctgagtacttctgtgctgtgag
|
| 104 |
+
Homosap TRAV8-6 TRAV8-6*01 X02850 F gcccagtctgtgacccagcttgacagccaagtccctgtctttgaagaagcccctgtggagctgaggtgcaactactcatcgtctgtttcagtgtatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtatttatcaggatccaccctggttgaaagcatcaacggttttgaggctgaatttaacaagagtcaaacttccttccacttgaggaaaccctcagtccatataagcgacacggctgagtacttctgtgctgtgagtga
|
| 105 |
+
Homosap TRAV8-6 TRAV8-6*02 AE000659 F gcccagtctgtgacccagcttgacagccaagtccctgtctttgaagaagcccctgtggagctgaggtgcaactactcatcgtctgtttcagtgtatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtatttatcaggatccaccctggttaaaggcatcaacggttttgaggctgaatttaacaagagtcaaacttccttccacttgaggaaaccctcagtccatataagcgacacggctgagtacttctgtgctgtgagtga
|
| 106 |
+
Homosap TRAV9-1 TRAV9-1*01 AE000659 F ggagattcagtggtccagacagaaggccaagtgctcccctctgaaggggattccctgattgtgaactgctcctatgaaaccacacagtacccttcccttttttggtatgtccaatatcctggagaaggtccacagctccacctgaaagccatgaaggccaatgacaagggaaggaacaaaggttttgaagccatgtaccgtaaagaaaccacttctttccacttggagaaagactcagttcaagagtcagactccgctgtgtacttctgtgctctgagtga
|
| 107 |
+
Homosap TRAV9-2 TRAV9-2*01 AE000659 F ggaaattcagtgacccagatggaagggccagtgactctctcagaagaggccttcctgactataaactgcacgtacacagccacaggatacccttcccttttctggtatgtccaatatcctggagaaggtctacagctcctcctgaaagccacgaaggctgatgacaagggaagcaacaaaggttttgaagccacataccgtaaagaaaccacttctttccacttggagaaaggctcagttcaagtgtcagactcagcggtgtacttctgtgctctgagtga
|
| 108 |
+
Homosap TRAV9-2 TRAV9-2*02 IMGT000024 F ggagattcagtgacccagatggaagggccagtgactctctcagaagaggccttcctgactataaactgcacgtacacagccacaggatacccttcccttttctggtatgtccaatatcctggagaaggtctacagctcctcctgaaagccacgaaggctgatgacaagggaagcaacaaaggttttgaagccacataccgtaaagaaaccacttctttccacttggagaaaggctcagttcaagtgtcagactcagcggtgtacttctgtgctctgagtga
|
| 109 |
+
Homosap TRAV9-2 TRAV9-2*03 L06881 (F) ggagattcagtgacccagatggaagggccagtgactctctcagaagaggccttcctgactataaactgcacgtacacagccacaggatacccttcccttttctggtatgtccaatatcctggagaaggtctacagctcctcctgaaagccacgaaggctgatgacaagggaagcaacaaaggttttgaagccacataccgtaaggaaaccacttctttccacttggagaaaggctcagttcaagtgtcagactcagcggtgtacttctgtgct
|
| 110 |
+
Homosap TRAV9-2 TRAV9-2*04 L06882 (F) ggaaattcagtgacccagatggaagggccagtgactctctcagaagaggccttcctgactataaactgcacgtacacagccacaggatacccttcccttttctggtatgtccaatatcctggagaaggtctacagctcctcctgaaagccacgaaggctgatgacaagggaagcaacaaaggttttgaagccacataccgtaaggaaaccacttctttccacttggagaaaggctcagttcaagtgtcagactcagcggtgtacttctgtgct
|
src/library/trbjs_aa.tsv
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Species Gene Allele AccNum Functionality aa_seq
|
| 2 |
+
Homosap TRBJ1-1 TRBJ1-1*01 K02545 F NTEAFFGQGTRLTVV
|
| 3 |
+
Homosap TRBJ1-2 TRBJ1-2*01 K02545 F NYGYTFGSGTRLTVV
|
| 4 |
+
Homosap TRBJ1-3 TRBJ1-3*01 M14158 F SGNTIYFGEGSWLTVV
|
| 5 |
+
Homosap TRBJ1-4 TRBJ1-4*01 M14158 F TNEKLFFGSGTQLSVL
|
| 6 |
+
Homosap TRBJ1-5 TRBJ1-5*01 M14158 F SNQPQHFGDGTRLSIL
|
| 7 |
+
Homosap TRBJ1-6 TRBJ1-6*01 M14158 F SYNSPLHFGNGTRLTVT
|
| 8 |
+
Homosap TRBJ1-6 TRBJ1-6*02 L36092 F SYNSPLHFGNGTRLTVT
|
| 9 |
+
Homosap TRBJ2-1 TRBJ2-1*01 X02987 F SYNEQFFGPGTRLTVL
|
| 10 |
+
Homosap TRBJ2-2 TRBJ2-2*01 X02987 F NTGELFFGEGSRLTVL
|
| 11 |
+
Homosap TRBJ2-3 TRBJ2-3*01 X02987 F STDTQYFGPGTRLTVL
|
| 12 |
+
Homosap TRBJ2-4 TRBJ2-4*01 X02987 F AKNIQYFGAGTRLSVL
|
| 13 |
+
Homosap TRBJ2-5 TRBJ2-5*01 X02987 F QETQYFGPGTRLLVL
|
| 14 |
+
Homosap TRBJ2-6 TRBJ2-6*01 X02987 F SGANVLTFGAGSRLTVL
|
| 15 |
+
Homosap TRBJ2-7 TRBJ2-7*01 M14159 F SYEQYFGPGTRLTVT
|
src/library/trbjs_nt.tsv
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Species Gene Allele AccNum Functionality nt_seq
|
| 2 |
+
Homosap TRBJ1-1 TRBJ1-1*01 K02545 F tgaacactgaagctttctttggacaaggcaccagactcacagttgtag
|
| 3 |
+
Homosap TRBJ1-2 TRBJ1-2*01 K02545 F ctaactatggctacaccttcggttcggggaccaggttaaccgttgtag
|
| 4 |
+
Homosap TRBJ1-3 TRBJ1-3*01 M14158 F ctctggaaacaccatatattttggagagggaagttggctcactgttgtag
|
| 5 |
+
Homosap TRBJ1-4 TRBJ1-4*01 M14158 F caactaatgaaaaactgttttttggcagtggaacccagctctctgtcttgg
|
| 6 |
+
Homosap TRBJ1-5 TRBJ1-5*01 M14158 F tagcaatcagccccagcattttggtgatgggactcgactctccatcctag
|
| 7 |
+
Homosap TRBJ1-6 TRBJ1-6*01 M14158 F ctcctataattcacccctccactttgggaatgggaccaggctcactgtgacag
|
| 8 |
+
Homosap TRBJ1-6 TRBJ1-6*02 L36092 F ctcctataattcacccctccactttgggaacgggaccaggctcactgtgacag
|
| 9 |
+
Homosap TRBJ2-1 TRBJ2-1*01 X02987 F ctcctacaatgagcagttcttcgggccagggacacggctcaccgtgctag
|
| 10 |
+
Homosap TRBJ2-2 TRBJ2-2*01 X02987 F cgaacaccggggagctgttttttggagaaggctctaggctgaccgtactgg
|
| 11 |
+
Homosap TRBJ2-3 TRBJ2-3*01 X02987 F agcacagatacgcagtattttggcccaggcacccggctgacagtgctcg
|
| 12 |
+
Homosap TRBJ2-4 TRBJ2-4*01 X02987 F agccaaaaacattcagtacttcggcgccgggacccggctctcagtgctgg
|
| 13 |
+
Homosap TRBJ2-5 TRBJ2-5*01 X02987 F accaagagacccagtacttcgggccaggcacgcggctcctggtgctcg
|
| 14 |
+
Homosap TRBJ2-6 TRBJ2-6*01 X02987 F ctctggggccaacgtcctgactttcggggccggcagcaggctgaccgtgctgg
|
| 15 |
+
Homosap TRBJ2-7 TRBJ2-7*01 M14159 F ctcctacgagcagtacttcgggccgggcaccaggctcacggtcacag
|
src/library/trbvs_aa.tsv
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Species Gene Allele AccNum Functionality aa_seq
|
| 2 |
+
Homosap TRBV10-1 TRBV10-1*01 L36092 F DAEITQSPRHKITETGRQVTLACHQTWNHNNMFWYRQDLGHGLRLIHYSYGVQDTNKGEVSDGYSVSRSNTEDLPLTLESAASSQTSVYFCASSE
|
| 3 |
+
Homosap TRBV10-1 TRBV10-1*02 AF009660 F DAEITQSPRHKITETGRQVTLACHQTWNHNNMFWYRQDLGHGLRLIHYSYGVHDTNKGEVSDGYSVSRSNTEDLPLTLESAASSQTSVYFCASSE
|
| 4 |
+
Homosap TRBV10-2 TRBV10-2*01 L36092 F DAGITQSPRYKITETGRQVTLMCHQTWSHSYMFWYRQDLGHGLRLIYYSAAADITDKGEVPDGYVVSRSKTENFPLTLESATRSQTSVYFCASSE
|
| 5 |
+
Homosap TRBV10-2 TRBV10-2*02 IMGT000021 F DAGITQSPRYKITETGRQVTLMCHQTWSHSYMFWYRQDLGHGLRLIYYSAAADITDKGEVPDGYVVSRSKTENFPLTLESATRSQTSVYFCASSE
|
| 6 |
+
Homosap TRBV10-3 TRBV10-3*01 U03115 F DAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGHGLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFCAISE
|
| 7 |
+
Homosap TRBV10-3 TRBV10-3*02 U17047 F DAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGHGLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFCAISE
|
| 8 |
+
Homosap TRBV10-3 TRBV10-3*03 L33101 [F] DAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGHGLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFC
|
| 9 |
+
Homosap TRBV10-3 TRBV10-3*04 L33102 [F] DAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGHGLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFC
|
| 10 |
+
Homosap TRBV11-1 TRBV11-1*01 M33233 F EAEVAQSPRYKITEKSQAVAFWCDPISGHATLYWYRQILGQGPELLVQFQDESVVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAMYLCASSL
|
| 11 |
+
Homosap TRBV11-2 TRBV11-2*01 L36092 F EAGVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQFQNNGVVDDSQLPKDRFSAERLKGVDSTLKIQPAKLEDSAVYLCASSL
|
| 12 |
+
Homosap TRBV11-2 TRBV11-2*02 M33235 [F] EAGVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQFQNNGVVDDSQLPKDRFSAERLKGVDSTLKIQPAKLENSAVYLCASS
|
| 13 |
+
Homosap TRBV11-2 TRBV11-2*03 IMGT000021 F EAGVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQFQNNGVVDDSQLPKDRFSAERLKGVDSTLKIQPAKLEDSAVYLCASSL
|
| 14 |
+
Homosap TRBV11-3 TRBV11-3*01 U03115 F EAGVVQSPRYKIIEKKQPVAFWCNPISGHNTLYWYLQNLGQGPELLIRYENEEAVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAVYLCASSL
|
| 15 |
+
Homosap TRBV11-3 TRBV11-3*02 X58797 (F) EAGVVQSPRYKIIEKKQPVAFWCNPISGHNTLYWYRQNLGQGPELLIRYENEEAVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAVYLCASS
|
| 16 |
+
Homosap TRBV11-3 TRBV11-3*04 AB305924 (F) EAGVVQSPRYKIIEKKQPVAFWCNPISGHNTLYWYRQNLGQGPELLIRYENEEAVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAVYLCASSL
|
| 17 |
+
Homosap TRBV12-3 TRBV12-3*01 X07192 F DAGVIQSPRHEVTEMGQEVTLRCKPISGHNSLFWYRQTMMRGLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL
|
| 18 |
+
Homosap TRBV12-4 TRBV12-4*01 K02546 F DAGVIQSPRHEVTEMGQEVTLRCKPISGHDYLFWYRQTMMRGLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL
|
| 19 |
+
Homosap TRBV12-4 TRBV12-4*02 M14264 (F) DAGVIQSPRHEVTEMGQEVTLRCKPISGHDYLFWYRQTMMRGLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLRIQPSEPRDSAVYFCASSL
|
| 20 |
+
Homosap TRBV12-5 TRBV12-5*01 X07223 F DARVTQTPRHKVTEMGQEVTMRCQPILGHNTVFWYRQTMMQGLELLAYFRNRAPLDDSGMPKDRFSAEMPDATLATLKIQPSEPRDSAVYFCASGL
|
| 21 |
+
Homosap TRBV13 TRBV13*01 U03115 F AAGVIQSPRHLIKEKRETATLKCYPIPRHDTVYWYQQGPGQDPQFLISFYEKMQSDKGSIPDRFSAQQFSDYHSELNMSSLELGDSALYFCASSL
|
| 22 |
+
Homosap TRBV13 TRBV13*02 M62378 (F) AAGVIQSPRHLIREKRETATLKCYPIPRHDTVYWYQQGPGQDPQFFISFYEKMQSDKGSIPDRFSAQQFSDYHSELNMSSLELGDSALYFCASS
|
| 23 |
+
Homosap TRBV14 TRBV14*01 X06154 F EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSQ
|
| 24 |
+
Homosap TRBV14 TRBV14*02 X57722 (F) EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASS
|
| 25 |
+
Homosap TRBV15 TRBV15*01 U03115 F DAMVIQNPRYQVTQFGKPVTLSCSQTLNHNVMYWYQQKSSQAPKLLFHYYDKDFNNEADTPDNFQSRRPNTSFCFLDIRSPGLGDTAMYLCATSR
|
| 26 |
+
Homosap TRBV15 TRBV15*02 IMGT000021 F DAMVIQNPRYQVTQFGKPVTLSCSQTLNHNVMYWYQQKSSQAPKLLFHYYDKDFNNEADTPDNFQSRRPNTSFCFLDIRSPGLGDAAMYLCATSR
|
| 27 |
+
Homosap TRBV15 TRBV15*03 M62376 (F) DAMVIQNPRYRVTQFGKPVTLSCSQTLNHNVMYWYQQKSSQAPKLLFHYYNKDFNNEADTPDNFQSRRPNTSFCFLDIRSPGLGDAAMYQCATS
|
| 28 |
+
Homosap TRBV16 TRBV16*01 L26231 F GEEVAQTPKHLVRGEGQKAKLYCAPIKGHSYVFWYQQVLKNEFKFLISFQNENVFDETGMPKERFSAKCLPNSPCSLEIQATKLEDSAVYFCASSQ
|
| 29 |
+
Homosap TRBV16 TRBV16*03 L26054 (F) GEEVAQTPKHLVRGEGQKAKLYCAPIKGHSYVFWYQQVLKNEFKFLVSFQNENVFDETGMPKERFSAKCLPNSPCSLEIQATKLEDSAVYFCASS
|
| 30 |
+
Homosap TRBV18 TRBV18*01 L36092 F NAGVMQNPRHLVRRRGQEARLRCSPMKGHSHVYWYRQLPEEGLKFMVYLQKENIIDESGMPKERFSAEFPKEGPSILRIQQVVRGDSAAYFCASSP
|
| 31 |
+
Homosap TRBV19 TRBV19*01 L36092 F DGGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQGLRLIYYSQIVNDFQKGDIAEGYSVSREKKESFPLTVTSAQKNPTAFYLCASSI
|
| 32 |
+
Homosap TRBV19 TRBV19*02 U48259 F DGGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQVPGQGLRLIYYSHIVNDFQKGDIAEGYSVSREKKESFPLTVTSAQKNPTAFYLCASSI
|
| 33 |
+
Homosap TRBV19 TRBV19*03 M97725 (F) DGGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQGLRLIYYSHIVNDFQKGDIAEGYSVSREKKESFPLTVTSAQKNPTAFYLCAS
|
| 34 |
+
Homosap TRBV2 TRBV2*01 L36092 F EPEVTQTPSHQVTQMGQEVILRCVPISNHLYFYWYRQILGQKVEFLVSFYNNEISEKSEIFDDQFSVERPDGSNFTLKIRSTKLEDSAMYFCASSE
|
| 35 |
+
Homosap TRBV2 TRBV2*02 M62379 (F) EPEVTQTPSHQVTQMGQEVILHCVPISNHLYFYWYRQILGQKVEFLVSFYNNEISEKSEIFDDQFSVERPDGSNFTLKIRSTKLEDSAMYFCASS
|
| 36 |
+
Homosap TRBV2 TRBV2*03 M64351 (F) EPEVTQTPSHQVTQMGQEVILRCVPISNHLYFYWYRQILGQKVEFLVSFYNNEISEKSEIFDDQFSVERPDGSNFTLKIRSTKLEDSAMYFCASSE
|
| 37 |
+
Homosap TRBV20-1 TRBV20-1*01 M11955 F GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR
|
| 38 |
+
Homosap TRBV20-1 TRBV20-1*02 X72719 F GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSA
|
| 39 |
+
Homosap TRBV20-1 TRBV20-1*03 M11954 (F) GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGCKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSA
|
| 40 |
+
Homosap TRBV20-1 TRBV20-1*04 M14263 (F) GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAS
|
| 41 |
+
Homosap TRBV20-1 TRBV20-1*05 X57604 (F) GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR
|
| 42 |
+
Homosap TRBV20-1 TRBV20-1*06 D13088 (F) GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSA
|
| 43 |
+
Homosap TRBV20-1 TRBV20-1*07 X74852 (F) GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLMQIATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR
|
| 44 |
+
Homosap TRBV20/OR9-2 TRBV20/OR9-2*01 L05149 (F) SAVVSQHPSRVICKSGTSVNIECRSLDFQATTMFWYRQLRKQSLMLMATSNEGSEVTYEQGVKKDKFPINHPNLTFSALTVTSAHPEDSSFYICSAR
|
| 45 |
+
Homosap TRBV20/OR9-2 TRBV20/OR9-2*03 L05149 (F) SAVVSQHPSRVICKSGTSVNIECRSLDFQATTMFWYRQLRKQSLMLMAASNEGSEVTYEQGVKKDKFPINHPNLTFSALTVTSAHPEDSSFYICSAR
|
| 46 |
+
Homosap TRBV21-1 TRBV21-1*01 AF029308 (F) DTKVTQRPRFLVKANEQKAKMDCVPIKRHSYVYWYHKTLEEELKFFIYFQNEEIIQKAEIINERFSAQCPQNSPCTLEIQSTESGDTARYFCANSK
|
| 47 |
+
Homosap TRBV23-1 TRBV23-1*01 L36092 (F) HAKVTQTPGHLVKGKGQKTKMDCTPEKGHTFVYWYQQNQNKEFMLLISFQNEQVLQETEMHKKRFSSQCPKNAPCSLAILSSEPGDTALYLCASSQ
|
| 48 |
+
Homosap TRBV24-1 TRBV24-1*01 M11951 F DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLRLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDL
|
| 49 |
+
Homosap TRBV24-1 TRBV24-1*02 IMGT000021 F DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLQLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDL
|
| 50 |
+
Homosap TRBV24/OR9-2 TRBV24/OR9-2*01 L05153 F DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLQLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDL
|
| 51 |
+
Homosap TRBV25-1 TRBV25-1*01 L36092 F EADIYQTPRYLVIGTGKKITLECSQTMGHDKMYWYQQDPGMELHLIHYSYGVNSTEKGDLSSESTVSRIRTEHFPLTLESARPSHTSQYLCASSE
|
| 52 |
+
Homosap TRBV27 TRBV27*01 L36092 F EAQVTQNPRYLITVTGKKLTVTCSQNMNHEYMSWYRQDPGLGLRQIYYSMNVEVTDKGDVPEGYKVSRKEKRNFPLILESPSPNQTSLYFCASSL
|
| 53 |
+
Homosap TRBV28 TRBV28*01 U08314 F DVKVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGLGLRLIYFSYDVKMKEKGDIPEGYSVSREKKERFSLILESASTNQTSMYLCASSL
|
| 54 |
+
Homosap TRBV29-1 TRBV29-1*01 L36092 F SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVE
|
| 55 |
+
Homosap TRBV29-1 TRBV29-1*02 M13847 (F) SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSSLTVSNMSPEDSSIYLCSVE
|
| 56 |
+
Homosap TRBV3-1 TRBV3-1*01 U07977 F DTAVSQTPKYLVTQMGNDKSIKCEQNLGHDTMYWYKQDSKKFLKIMFSYNNKELIINETVPNRFSPKSPDKAHLNLHINSLELGDSAVYFCASSQ
|
| 57 |
+
Homosap TRBV3-1 TRBV3-1*02 L06889 (F) DTAVSQTPKYLVTQMGNDKSIKCEQNLGHDTMYWYKQDSKKFLKIMFSYNNKEIIINETVPNRFSPKSPDKAKLNLHINSLELGDSAVYFCAS
|
| 58 |
+
Homosap TRBV30 TRBV30*01 L36092 F SQTIHQWPATLVQPVGSPLSLECTVEGTSNPNLYWYRQAAGRGLQLLFYSVGIGQISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYLCAWS
|
| 59 |
+
Homosap TRBV30 TRBV30*02 Z13967 F SQTIHQWPATLVQPVGSPLSLECTVEGTSNPNLYWYRQAAGRGLQLLFYSVGIGQISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYLCAWS
|
| 60 |
+
Homosap TRBV30 TRBV30*05 L06893 (F) SQTIHQWPATLVQPVGSPLSLECTVEGTSNPNLYWYRQAAGRGLQLLFYSVGIGQISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYLCAWG
|
| 61 |
+
Homosap TRBV4-1 TRBV4-1*01 U07977 F DTEVTQTPKHLVMGMTNKKSLKCEQHMGHRAMYWYKQKAKKPPELMFVYSYEKLSINESVPSRFSPECPNSSLLNLHLHALQPEDSALYLCASSQ
|
| 62 |
+
Homosap TRBV4-2 TRBV4-2*01 U07975 F ETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKKPLELMFVYNFKEQTENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASSQ
|
| 63 |
+
Homosap TRBV4-2 TRBV4-2*02 X58811 (F) ETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKKPLELMFVYNFKEQTENNSVPSRFSPECPNSSHLCLHLHTLQPEDSALYLCAST
|
| 64 |
+
Homosap TRBV4-3 TRBV4-3*01 U07978 F ETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKKPLELMFVYSLEERVENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASSQ
|
| 65 |
+
Homosap TRBV4-3 TRBV4-3*02 X58812 (F) ETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKKPLELMFVYSLEERVENNSVPSRFSPECPNSSHLSLHLHTLQPEDSALYLCASS
|
| 66 |
+
Homosap TRBV4-3 TRBV4-3*03 L06888 (F) ETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKKPLELMFVYSLEERVENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASS
|
| 67 |
+
Homosap TRBV5-1 TRBV5-1*01 L36092 F KAGVTQTPRYLIKTRGQQVTLSCSPISGHRSVSWYQQTPGQGLQFLFEYFSETQRNKGNFPGRFSGRQFSNSRSEMNVSTLELGDSALYLCASSL
|
| 68 |
+
Homosap TRBV5-1 TRBV5-1*02 M14271 (F) RAGVTQTPRHLIKTRGQQVTLGCSPISGHRSVSWYQQTLGQGLQFLFEYFSETQRNKGNFLGRFSGRQFSNSRSEMNVSTLELGDSALYLCAS
|
| 69 |
+
Homosap TRBV5-3 TRBV5-3*01 L36092 F EAGVTQSPTHLIKTRGQQVTLRCSPISGHSSVSWYQQAPGQGPQFIFEYANELRRSEGNFPNRFSGRQFHDCCSEMNVSALELGDSALYLCARSL
|
| 70 |
+
Homosap TRBV5-4 TRBV5-4*01 L36092 F ETGVTQSPTHLIKTRGQQVTLRCSSQSGHNTVSWYQQALGQGPQFIFQYYREEENGRGNFPPRFSGLQFPNYSSELNVNALELDDSALYLCASSL
|
| 71 |
+
Homosap TRBV5-4 TRBV5-4*02 X57615 (F) ETGVTQSPTHLIKTRGQQVTLRCSSQSGHNTVSWYQQALGQGPQFIFQYYREEENGRGNFPPRFSGLQFPNYNSELNVNALELDDSALYLCASS
|
| 72 |
+
Homosap TRBV5-5 TRBV5-5*01 L36092 F DAGVTQSPTHLIKTRGQQVTLRCSPISGHKSVSWYQQVLGQGPQFIFQYYEKEERGRGNFPDRFSARQFPNYSSELNVNALLLGDSALYLCASSL
|
| 73 |
+
Homosap TRBV5-5 TRBV5-5*02 X57611 (F) DAGVTQSPTHLIKTRGQHVTLRCSPISGHKSVSWYQQVLGQGPQFIFQYYEKEERGRGNFPDRFSARQFPNYSSELNVNALLLGDSALYLCASS
|
| 74 |
+
Homosap TRBV5-5 TRBV5-5*03 X58801 (F) DAGVTQSPTHLIKTRGQQVTLRCSPISEHKSVSWYQQVLGQGPQFIFQYYEKEERGRGNFPDRFSARQFPNYSSELNVNALLLGDSALYLCASS
|
| 75 |
+
Homosap TRBV5-6 TRBV5-6*01 L36092 F DAGVTQSPTHLIKTRGQQVTLRCSPKSGHDTVSWYQQALGQGPQFIFQYYEEEERQRGNFPDRFSGHQFPNYSSELNVNALLLGDSALYLCASSL
|
| 76 |
+
Homosap TRBV5-7 TRBV5-7*01 L36092 F DAGVTQSPTHLIKTRGQHVTLRCSPISGHTSVSSYQQALGQGPQFIFQYYEKEERGRGNFPDQFSGHQFPNYSSELNVNALLLGDSALYLCASSL
|
| 77 |
+
Homosap TRBV5-8 TRBV5-8*01 L36092 F EAGVTQSPTHLIKTRGQQATLRCSPISGHTSVYWYQQALGLGLQFLLWYDEGEERNRGNFPPRFSGRQFPNYSSELNVNALELEDSALYLCASSL
|
| 78 |
+
Homosap TRBV6-1 TRBV6-1*01 X61446 F NAGVTQTPKFQVLKTGQSMTLQCAQDMNHNSMYWYRQDPGMGLRLIYYSASEGTTDKGEVPNGYNVSRLNKREFSLRLESAAPSQTSVYFCASSE
|
| 79 |
+
Homosap TRBV6-2 TRBV6-2*01 X61445 F NAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGMGLRLIHYSVGEGTTAKGEVPDGYNVSRLKKQNFLLGLESAAPSQTSVYFCASSY
|
| 80 |
+
Homosap TRBV6-3 TRBV6-3*01 U07978 F NAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGMGLRLIHYSVGEGTTAKGEVPDGYNVSRLKKQNFLLGLESAAPSQTSVYFCASSY
|
| 81 |
+
Homosap TRBV6-4 TRBV6-4*01 X61653 F IAGITQAPTSQILAAGRRMTLRCTQDMRHNAMYWYRQDLGLGLRLIHYSNTAGTTGKGEVPDGYSVSRANTDDFPLTLASAVPSQTSVYFCASSD
|
| 82 |
+
Homosap TRBV6-4 TRBV6-4*02 AF009660 F TAGITQAPTSQILAAGRSMTLRCTQDMRHNAMYWYRQDLGLGLRLIHYSNTAGTTGKGEVPDGYSVSRANTDDFPLTLASAVPSQTSVYFCASSD
|
| 83 |
+
Homosap TRBV6-5 TRBV6-5*01 L36092 F NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLIHYSVGAGITDQGEVPNGYNVSRSTTEDFPLRLLSAAPSQTSVYFCASSY
|
| 84 |
+
Homosap TRBV6-6 TRBV6-6*01 L36092 F NAGVTQTPKFRILKIGQSMTLQCTQDMNHNYMYWYRQDPGMGLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAPSQTSVYFCASSY
|
| 85 |
+
Homosap TRBV6-6 TRBV6-6*02 AF009662 F NAGVTQTPKFRILKIGQSMTLQCAQDMNHNYMYWYRQDPGMGLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAPSQTSVYFCASSY
|
| 86 |
+
Homosap TRBV6-6 TRBV6-6*03 X58815 (F) NAGVTQTPKFRILKIGQSMTLQCAQDMNHNYMYWYRQDPGMGLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAPSQTSVYFCASS
|
| 87 |
+
Homosap TRBV6-6 TRBV6-6*04 X74848 (F) NAGVTQTPKFRILKIGQSMTLQCTQDMNHEYMYWYRQDPGMGLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAPSQTSVYFCASSR
|
| 88 |
+
Homosap TRBV6-6 TRBV6-6*05 L06892 (F) NAGVTQTPKFRILKIGQSMTLQCAQDMNHNYMYWYRQDPGMGLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAASQTSVYFCASS
|
| 89 |
+
Homosap TRBV6-7 TRBV6-7*01 L36092 F NAGVTQTPKFHVLKTGQSMTLLCAQDMNHEYMYRYRQDPGKGLRLIYYSVAAALTDKGEVPNGYNVSRSNTEDFPLKLESAAPSQTSVYFCASSY
|
| 90 |
+
Homosap TRBV6-8 TRBV6-8*01 L36092 F NAGVTQTPKFHILKTGQSMTLQCAQDMNHGYMSWYRQDPGMGLRLIYYSAAAGTTDKEVPNGYNVSRLNTEDFPLRLVSAAPSQTSVYLCASSY
|
| 91 |
+
Homosap TRBV6-9 TRBV6-9*01 X61447 F NAGVTQTPKFHILKTGQSMTLQCAQDMNHGYLSWYRQDPGMGLRRIHYSVAAGITDKGEVPDGYNVSRSNTEDFPLRLESAAPSQTSVYFCASSY
|
| 92 |
+
Homosap TRBV7-1 TRBV7-1*01 X61444 F GAGVSQSLRHKVAKKGKDVALRYDPISGHNALYWYRQSLGQGLEFPIYFQGKDAADKSGLPRDRFSAQRSEGSISTLKFQRTQQGDLAVYLCASSS
|
| 93 |
+
Homosap TRBV7-2 TRBV7-2*01 X61442 F GAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQSLGQGLEFLIYFQGNSAPDKSGLPSDRFSAERTGGSVSTLTIQRTQQEDSAVYLCASSL
|
| 94 |
+
Homosap TRBV7-2 TRBV7-2*02 L36190 F GAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQRLGQGLEFLIYFQGNSAPDKSGLPSDRFSAERTGESVSTLTIQRTQQEDSAVYLCASSL
|
| 95 |
+
Homosap TRBV7-2 TRBV7-2*03 U07975 F GAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQRLGQGLEFLIYFQGNSAPDKSGLPSDRFSAERTGESVSTLTIQRTQQEDSAVYLCTSSL
|
| 96 |
+
Homosap TRBV7-2 TRBV7-2*04 M27387 (F) GAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQSLGQGLEFLIYFQGNSAPDKSGLPSDRFSAERTGGSVSTLTIQRTQQEDSAVYLCASSL
|
| 97 |
+
Homosap TRBV7-3 TRBV7-3*01 X61440 F GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFLIYFQGTGAADDSGLPNDRFFAVRPEGSVSTLKIQRTERGDSAVYLCASSL
|
| 98 |
+
Homosap TRBV7-3 TRBV7-3*04 X74843 (F) GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFLIYFQGTGAADDSGLPNDRFFAVRPEGSVSTLKIQRTERGDSAVYLCASS
|
| 99 |
+
Homosap TRBV7-4 TRBV7-4*01 L36092 F GAGVSQSPRYKVAKRGRDVALRCDSISGHVTLYWYRQTLGQGSEVLTYSQSDAQRDKSGRPSGRFSAERPERSVSTLKIQRTEQGDSAVYLCASSL
|
| 100 |
+
Homosap TRBV7-6 TRBV7-6*01 L36092 F GAGVSQSPRYKVTKRGQDVALRCDPISGHVSLYWYRQALGQGPEFLTYFNYEAQQDKSGLPNDRFSAERPEGSISTLTIQRTEQRDSAMYRCASSL
|
| 101 |
+
Homosap TRBV7-6 TRBV7-6*02 X58806 (F) GAGVSQSPRYKVTKRGQDVALRCDPISGHVSLYWYRQALGQGPEFLTYFNYEAQQDKSGLPNDRFSAERPEGSISTLTIQRTEQRDSAMYRCASS
|
| 102 |
+
Homosap TRBV7-7 TRBV7-7*01 L36092 F GAGVSQSPRYKVTKRGQDVTLRCDPISSHATLYWYQQALGQGPEFLTYFNYEAQPDKSGLPSDRFSAERPEGSISTLTIQRTEQRDSAMYRCASSL
|
| 103 |
+
Homosap TRBV7-7 TRBV7-7*02 X57607 (F) GAGVSQSPRYKVTKRGQDVTLRCDPISSHVTLYWYQQALGQGPEFLTYFNYEAQPDKSGLPSDRFSAERPEGSISTLTIQRTEQRDSAMYRCASS
|
| 104 |
+
Homosap TRBV7-8 TRBV7-8*01 M11953 F GAGVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTYFQNEAQLDKSGLPSDRFFAERPEGSVSTLKIQRTQQEDSAVYLCASSL
|
| 105 |
+
Homosap TRBV7-8 TRBV7-8*02 X61441 F GAGVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTYFQNEAQLDKSGLPSDRFFAERPEGSVSTLKIQRTQKEDSAVYLCASSL
|
| 106 |
+
Homosap TRBV7-8 TRBV7-8*03 M27384 (F) GAGVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTYFQNEAQLDKSGLPSDRFFAERPEGSVSTLKIQRTQQEDSAVYLCASSR
|
| 107 |
+
Homosap TRBV7-9 TRBV7-9*01 L36092 F DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSL
|
| 108 |
+
Homosap TRBV7-9 TRBV7-9*02 M15564 (F) DTGVSQNPRHNITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSL
|
| 109 |
+
Homosap TRBV7-9 TRBV7-9*03 AF009663 F DTGVSQDPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSL
|
| 110 |
+
Homosap TRBV7-9 TRBV7-9*04 M14261 (F) ISGVSHNPRHKITKRGQNVTFRCDPISEHNRLYWYRQNPGQGPEFLTYFQNEAQLEKSGLLSDRISAERPKGSFSTLEIQRTEQGDSAMYLCASS
|
| 111 |
+
Homosap TRBV7-9 TRBV7-9*05 M27385 (F) DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSLSTLEIQRTEQGDSAMYLCASTK
|
| 112 |
+
Homosap TRBV7-9 TRBV7-9*06 X74844 (F) DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSLSTLEIQRTEQGDSAMYLCASTL
|
| 113 |
+
Homosap TRBV9 TRBV9*01 L36092 F DSGVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQYYNGEERAKGNILERFSAQQFPDLHSELNLSSLELGDSALYFCASSV
|
| 114 |
+
Homosap TRBV9 TRBV9*02 AF009660 F DSGVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIHYYNGEERAKGNILERFSAQQFPDLHSELNLSSLELGDSALYFCASSV
|
| 115 |
+
Homosap TRBV9 TRBV9*03 M27380 (F) DSGVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQYYNGEERAKGNILERFSAQQFPDLHSELNLSSLELGDSALYFCASS
|
src/library/trbvs_nt.tsv
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Species Gene Allele AccNum Functionality nt_seq
|
| 2 |
+
Homosap TRBV10-1 TRBV10-1*01 L36092 F gatgctgaaatcacccagagcccaagacacaagatcacagagacaggaaggcaggtgaccttggcgtgtcaccagacttggaaccacaacaatatgttctggtatcgacaagacctgggacatgggctgaggctgatccattactcatatggtgttcaagacactaacaaaggagaagtctcagatggctacagtgtctctagatcaaacacagaggacctccccctcactctggagtctgctgcctcctcccagacatctgtatatttctgcgccagcagtgagtc
|
| 3 |
+
Homosap TRBV10-1 TRBV10-1*02 AF009660 F gatgctgaaatcacccagagcccaagacacaagatcacagagacaggaaggcaggtgaccttggcgtgtcaccagacttggaaccacaacaatatgttctggtatcgacaagacctgggacatgggctgaggctgatccattactcatatggtgttcacgacactaacaaaggagaagtctcagatggctacagtgtctctagatcaaacacagaggacctccccctcactctggagtctgctgcctcctcccagacatctgtatatttctgcgccagcagtgagtc
|
| 4 |
+
Homosap TRBV10-2 TRBV10-2*01 L36092 F gatgctggaatcacccagagcccaagatacaagatcacagagacaggaaggcaggtgaccttgatgtgtcaccagacttggagccacagctatatgttctggtatcgacaagacctgggacatgggctgaggctgatctattactcagcagctgctgatattacagataaaggagaagtccccgatggctatgttgtctccagatccaagacagagaatttccccctcactctggagtcagctacccgctcccagacatctgtgtatttctgcgccagcagtgagtc
|
| 5 |
+
Homosap TRBV10-2 TRBV10-2*02 IMGT000021 F gatgctggaatcacccagagcccaagatacaagatcacagagacaggaaggcaggtgaccttgatgtgtcaccagacttggagccacagctatatgttctggtatcgacaagacctgggacatgggctgaggctgatctattactcagcagctgctgatattacagataaaggagaagtccccgatggctacgttgtctccagatccaagacagagaatttccccctcactctggagtcagctacccgctcccagacatctgtgtatttctgcgccagcagtgagtc
|
| 6 |
+
Homosap TRBV10-3 TRBV10-3*01 U03115 F gatgctggaatcacccagagcccaagacacaaggtcacagagacaggaacaccagtgactctgagatgtcaccagactgagaaccaccgctatatgtactggtatcgacaagacccggggcatgggctgaggctgatccattactcatatggtgttaaagatactgacaaaggagaagtctcagatggctatagtgtctctagatcaaagacagaggatttcctcctcactctggagtccgctaccagctcccagacatctgtgtacttctgtgccatcagtgagtc
|
| 7 |
+
Homosap TRBV10-3 TRBV10-3*02 U17047 F gatgctggaatcacccagagcccaagacacaaggtcacagagacaggaacaccagtgactctgagatgtcatcagactgagaaccaccgctatatgtactggtatcgacaagacccggggcatgggctgaggctgatccattactcatatggtgttaaagatactgacaaaggagaagtctcagatggctatagtgtctctagatcaaagacagaggatttcctcctcactctggagtccgctaccagctcccagacatctgtgtacttctgtgccatcagtgagtc
|
| 8 |
+
Homosap TRBV10-3 TRBV10-3*03 L33101 [F] gatgctggaatcacccagagcccaagacacaaggtcacagagacaggaacaccagtgactctgagatgtcaccagactgagaaccaccgctacatgtactggtatcgacaagacccggggcatgggctgaggctaatccattactcatatggtgttaaagatactgacaaaggagaagtctcagatggctatagtgtctctagatcaaagacagaggatttcctcctcactctggagtccgctaccagctcccagacatctgtgtacttctgt
|
| 9 |
+
Homosap TRBV10-3 TRBV10-3*04 L33102 [F] gatgctggaatcacccagagcccaagacacaaggtcacagagacaggaacaccagtgactctgagatgtcaccagactgagaaccaccgctacatgtactggtatcgacaagacccggggcatgggctgaggctgatccattactcatatggtgttaaagatactgacaaaggagaagtctcagatggctatagtgtctctagatcaaagacagaggatttcctcctcactctggagtccgctaccagctcccagacatctgtgtacttctgt
|
| 10 |
+
Homosap TRBV11-1 TRBV11-1*01 M33233 F gaagctgaagttgcccagtcccccagatataagattacagagaaaagccaggctgtggctttttggtgtgatcctatttctggccatgctaccctttactggtaccggcagatcctgggacagggcccggagcttctggttcaatttcaggatgagagtgtagtagatgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcagagcttggggactcggccatgtatctctgtgccagcagcttagc
|
| 11 |
+
Homosap TRBV11-2 TRBV11-2*01 L36092 F gaagctggagttgcccagtctcccagatataagattatagagaaaaggcagagtgtggctttttggtgcaatcctatatctggccatgctaccctttactggtaccagcagatcctgggacagggcccaaagcttctgattcagtttcagaataacggtgtagtggatgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcaaagcttgaggactcggccgtgtatctctgtgccagcagcttaga
|
| 12 |
+
Homosap TRBV11-2 TRBV11-2*02 M33235 [F] gaagctggagttgcccagtctcccagatataagattatagagaaaaggcagagtgtggctttttggtgcaatcctatatctggccatgctaccctttactggtaccagcagatcctgggacagggcccaaagcttctgattcagtttcagaataacggtgtagtggatgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcaaagcttgagaactcggccgtgtatctctgtgccagcagt
|
| 13 |
+
Homosap TRBV11-2 TRBV11-2*03 IMGT000021 F gaagctggagttgcccagtctcccagatataagattatagagaaaaggcagagtgtggctttttggtgcaatcctatatctggccatgctaccctttactggtaccagcagatcctgggacagggcccaaagcttctgattcagtttcagaataacggtgtagtggatgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccaacctgcaaagcttgaggactcggccgtgtatctctgtgccagcagcttaga
|
| 14 |
+
Homosap TRBV11-3 TRBV11-3*01 U03115 F gaagctggagtggttcagtctcccagatataagattatagagaaaaaacagcctgtggctttttggtgcaatcctatttctggccacaataccctttactggtacctgcagaacttgggacagggcccggagcttctgattcgatatgagaatgaggaagcagtagacgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcagagcttggggactcggccgtgtatctctgtgccagcagcttaga
|
| 15 |
+
Homosap TRBV11-3 TRBV11-3*02 X58797 (F) gaagctggagtggttcagtctcccagatataagattatagagaaaaagcagcctgtggctttttggtgcaatcctatttctggccacaataccctttactggtaccggcagaacttgggacagggcccggagcttctgattcgatatgagaatgaggaagcagtagacgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcagagcttggggactcggccgtgtatctctgtgccagcagc
|
| 16 |
+
Homosap TRBV11-3 TRBV11-3*04 AB305924 (F) gaagctggagtggttcagtctcccagatataagattatagagaaaaaacagcctgtggctttttggtgcaatcctatttctggccacaataccctttactggtaccggcagaacttgggacagggcccggagcttctgattcgatatgagaatgaggaagcagtagacgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcagagcttggggactcggccgtgtatctctgtgccagcagcttag
|
| 17 |
+
Homosap TRBV12-3 TRBV12-3*01 X07192 F gatgctggagttatccagtcaccccgccatgaggtgacagagatgggacaagaagtgactctgagatgtaaaccaatttcaggccacaactcccttttctggtacagacagaccatgatgcggggactggagttgctcatttactttaacaacaacgttccgatagatgattcagggatgcccgaggatcgattctcagctaagatgcctaatgcatcattctccactctgaagatccagccctcagaacccagggactcagctgtgtacttctgtgccagcagtttagc
|
| 18 |
+
Homosap TRBV12-4 TRBV12-4*01 K02546 F gatgctggagttatccagtcaccccggcacgaggtgacagagatgggacaagaagtgactctgagatgtaaaccaatttcaggacacgactaccttttctggtacagacagaccatgatgcggggactggagttgctcatttactttaacaacaacgttccgatagatgattcagggatgcccgaggatcgattctcagctaagatgcctaatgcatcattctccactctgaagatccagccctcagaacccagggactcagctgtgtacttctgtgccagcagtttagc
|
| 19 |
+
Homosap TRBV12-4 TRBV12-4*02 M14264 (F) gatgctggagttatccagtcaccccggcacgaggtgacagagatgggacaagaagtgactctgagatgtaaaccaatttcaggacatgactaccttttctggtacagacagaccatgatgcggggactggagttgctcatttactttaacaacaacgttccgatagatgattcagggatgcccgaggatcgattctcagctaagatgcctaatgcatcattctccactctgaggatccagccctcagaacccagggactcagctgtgtacttctgtgccagcagttta
|
| 20 |
+
Homosap TRBV12-5 TRBV12-5*01 X07223 F gatgctagagtcacccagacaccaaggcacaaggtgacagagatgggacaagaagtaacaatgagatgtcagccaattttaggccacaatactgttttctggtacagacagaccatgatgcaaggactggagttgctggcttacttccgcaaccgggctcctctagatgattcggggatgccgaaggatcgattctcagcagagatgcctgatgcaactttagccactctgaagatccagccctcagaacccagggactcagctgtgtatttttgtgctagtggtttggt
|
| 21 |
+
Homosap TRBV13 TRBV13*01 U03115 F gctgctggagtcatccagtccccaagacatctgatcaaagaaaagagggaaacagccactctgaaatgctatcctatccctagacacgacactgtctactggtaccagcagggtccaggtcaggacccccagttcctcatttcgttttatgaaaagatgcagagcgataaaggaagcatccctgatcgattctcagctcaacagttcagtgactatcattctgaactgaacatgagctccttggagctgggggactcagccctgtacttctgtgccagcagcttagg
|
| 22 |
+
Homosap TRBV13 TRBV13*02 M62378 (F) gctgctggagtcatccagtccccaagacatctgatcagagaaaagagggaaacagccactctgaaatgctatcctatccctagacacgacactgtctactggtaccagcagggcccaggtcaggacccccagttcttcatttcgttttatgaaaagatgcagagcgataaaggaagcatccctgatcgattctcagctcaacagttcagtgactatcattctgaactgaacatgagctccttggagctgggggactcagccctgtacttctgtgccagcagc
|
| 23 |
+
Homosap TRBV14 TRBV14*01 X06154 F gaagctggagttactcagttccccagccacagcgtaatagagaagggccagactgtgactctgagatgtgacccaatttctggacatgataatctttattggtatcgacgtgttatgggaaaagaaataaaatttctgttacattttgtgaaagagtctaaacaggatgagtccggtatgcccaacaatcgattcttagctgaaaggactggagggacgtattctactctgaaggtgcagcctgcagaactggaggattctggagtttatttctgtgccagcagccaaga
|
| 24 |
+
Homosap TRBV14 TRBV14*02 X57722 (F) gaagctggagttactcagttccccagccacagcgtaatagagaagggccagactgtgactctgagatgtgacccaatttctggacatgataatctttattggtatcgacgtgttatgggaaaagaaataaaatttctgttacattttgtgaaagagtctaaacaggatgaatccggtatgcccaacaatcgattcttagctgaaaggactggagggacgtattctactctgaaggtgcagcctgcagaactggaggattctggagtttatttctgtgccagcagc
|
| 25 |
+
Homosap TRBV15 TRBV15*01 U03115 F gatgccatggtcatccagaacccaagataccaggttacccagtttggaaagccagtgaccctgagttgttctcagactttgaaccataacgtcatgtactggtaccagcagaagtcaagtcaggccccaaagctgctgttccactactatgacaaagattttaacaatgaagcagacacccctgataacttccaatccaggaggccgaacacttctttctgctttcttgacatccgctcaccaggcctgggggacacagccatgtacctgtgtgccaccagcagaga
|
| 26 |
+
Homosap TRBV15 TRBV15*02 IMGT000021 F gatgccatggtcatccagaacccaagataccaggttacccagtttggaaagccagtgaccctgagttgttctcagactttgaaccataacgtcatgtactggtaccagcagaagtcaagtcaggccccaaagctgctgttccactactatgacaaagattttaacaatgaagcagacacccctgataacttccaatccaggaggccgaacacttctttctgctttcttgacatccgctcaccaggcctgggggacgcagccatgtacctgtgtgccaccagcagaga
|
| 27 |
+
Homosap TRBV15 TRBV15*03 M62376 (F) gatgccatggtcatccagaacccaagataccgggttacccagtttggaaagccagtgaccctgagttgttctcagactttgaaccataacgtcatgtactggtaccagcagaagtcaagtcaggccccaaagctgctgttccactactataacaaagattttaacaatgaagcagacacccctgataacttccaatccaggaggccgaacacttctttctgctttctagacatccgctcaccaggcctgggggacgcagccatgtaccagtgtgccaccagc
|
| 28 |
+
Homosap TRBV16 TRBV16*01 L26231 F ggtgaagaagtcgcccagactccaaaacatcttgtcagaggggaaggacagaaagcaaaattatattgtgccccaataaaaggacacagttatgttttttggtaccaacaggtcctgaaaaacgagttcaagttcttgatttccttccagaatgaaaatgtctttgatgaaacaggtatgcccaaggaaagattttcagctaagtgcctcccaaattcaccctgtagccttgagatccaggctacgaagcttgaggattcagcagtgtatttttgtgccagcagccaatc
|
| 29 |
+
Homosap TRBV16 TRBV16*03 L26054 (F) ggtgaagaagtcgcccagactccaaaacatcttgtcagaggggaaggacagaaagcaaaattatattgtgccccaataaaaggacacagttatgttttttggtaccaacaggtcctgaaaaacgagttcaagttcttggtttccttccagaatgaaaatgtctttgatgaaacaggtatgcccaaggaaagattttcagctaagtgcctcccaaattcaccctgtagccttgagatccaggctacgaagcttgaggattcagcagtgtatttttgtgccagcagc
|
| 30 |
+
Homosap TRBV18 TRBV18*01 L36092 F aatgccggcgtcatgcagaacccaagacacctggtcaggaggaggggacaggaggcaagactgagatgcagcccaatgaaaggacacagtcatgtttactggtatcggcagctcccagaggaaggtctgaaattcatggtttatctccagaaagaaaatatcatagatgagtcaggaatgccaaaggaacgattttctgctgaatttcccaaagagggccccagcatcctgaggatccagcaggtagtgcgaggagattcggcagcttatttctgtgccagctcaccacc
|
| 31 |
+
Homosap TRBV19 TRBV19*01 L36092 F gatggtggaatcactcagtccccaaagtacctgttcagaaaggaaggacagaatgtgaccctgagttgtgaacagaatttgaaccacgatgccatgtactggtaccgacaggacccagggcaagggctgagattgatctactactcacagatagtaaatgactttcagaaaggagatatagctgaagggtacagcgtctctcgggagaagaaggaatcctttcctctcactgtgacatcggcccaaaagaacccgacagctttctatctctgtgccagtagtataga
|
| 32 |
+
Homosap TRBV19 TRBV19*02 U48259 F gatggtggaatcactcagtccccaaagtacctgttcagaaaggaaggacagaatgtgaccctgagttgtgaacagaatttgaaccacgatgccatgtactggtaccgacaggtcccagggcaagggctgagattgatctactactcacacatagtaaatgactttcagaaaggagatatagctgaagggtacagcgtctctcgggagaagaaggaatcctttcctctcactgtgacatcggcccaaaagaacccgacagctttctatctctgtgccagtagtataga
|
| 33 |
+
Homosap TRBV19 TRBV19*03 M97725 (F) gatggtggaatcactcagtccccaaagtacctgttcagaaaggaaggacagaatgtgaccctgagttgtgaacagaatttgaaccacgatgccatgtactggtaccgacaggacccagggcaagggctgagattgatctactactcacacatagtaaatgactttcagaaaggagatatagctgaagggtacagcgtctctcgggagaagaaggaatcctttcctctcactgtgacatcggcccaaaagaacccgacagctttctatctctgtgccagtag
|
| 34 |
+
Homosap TRBV2 TRBV2*01 L36092 F gaacctgaagtcacccagactcccagccatcaggtcacacagatgggacaggaagtgatcttgcgctgtgtccccatctctaatcacttatacttctattggtacagacaaatcttggggcagaaagtcgagtttctggtttccttttataataatgaaatctcagagaagtctgaaatattcgatgatcaattctcagttgaaaggcctgatggatcaaatttcactctgaagatccggtccacaaagctggaggactcagccatgtacttctgtgccagcagtgaagc
|
| 35 |
+
Homosap TRBV2 TRBV2*02 M62379 (F) gaacctgaagtcacccagactcccagccatcaggtcacacagatgggacaggaagtgatcttgcactgtgtccccatctctaatcacttatacttctattggtacagacaaatcttggggcagaaagtcgagtttctggtttccttttataataatgaaatctcagagaagtctgaaatattcgatgatcaattctcagttgaaaggcctgatggatcaaatttcactctgaagatccggtccacaaagctggaggactcagccatgtacttctgtgccagcagt
|
| 36 |
+
Homosap TRBV2 TRBV2*03 M64351 (F) gaacctgaagtcacccagactcccagccatcaggtcacacagatgggacaggaagtgatcttgcgctgtgtccccatctctaatcacttatacttctattggtacagacaaatcttggggcagaaagtcgagtttctggtttccttttataataatgaaatctcagagaagtctgaaatattcgatgatcaattctcagttgagaggcctgatggatcaaatttcactctgaagatccggtccacaaagctggaggactcagccatgtacttctgtgccagcagtgaa
|
| 37 |
+
Homosap TRBV20-1 TRBV20-1*01 M11955 F ggtgctgtcgtctctcaacatccgagctgggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaacagagtctcatgctgatggcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgctagaga
|
| 38 |
+
Homosap TRBV20-1 TRBV20-1*02 X72719 F ggtgctgtcgtctctcaacatccgagcagggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaacagagtctcatgctgatggcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgct
|
| 39 |
+
Homosap TRBV20-1 TRBV20-1*03 M11954 (F) ggtgctgtcgtctctcaacatccgagctgggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaacagagtctcatgctgatggcaacttccaatgagggctgcaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgct
|
| 40 |
+
Homosap TRBV20-1 TRBV20-1*04 M14263 (F) ggtgctgtcgtctctcaacatccgagcagggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccttggactttcaggccacaactatgttttggtatcgtcagttcccgaaaaagagtctcatgctgatggcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgctagt
|
| 41 |
+
Homosap TRBV20-1 TRBV20-1*05 X57604 (F) ggtgctgtcgtctctcaacatccgagcagggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaaaagagtctcatgctgatggcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgctaga
|
| 42 |
+
Homosap TRBV20-1 TRBV20-1*06 D13088 (F) ggtgctgtcgtctctcaacatccgagtagggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaaaagagtctcatgctgatggcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgct
|
| 43 |
+
Homosap TRBV20-1 TRBV20-1*07 X74852 (F) ggtgctgtcgtctctcaacatccgagcagggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaaaagagtctcatgcagatcgcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgctaga
|
| 44 |
+
Homosap TRBV24-1 TRBV24-1*01 M11951 F gatgctgatgttacccagaccccaaggaataggatcacaaagacaggaaagaggattatgctggaatgttctcagactaagggtcatgatagaatgtactggtatcgacaagacccaggactgggcctacggttgatctattactcctttgatgtcaaagatataaacaaaggagagatctctgatggatacagtgtctctcgacaggcacaggctaaattctccctgtccctagagtctgccatccccaaccagacagctctttacttctgtgccaccagtgatttg
|
| 45 |
+
Homosap TRBV24-1 TRBV24-1*02 IMGT000021 F gatgctgatgttacccagaccccaaggaataggatcacaaagacaggaaagaggattatgctggaatgttctcagactaagggtcatgatagaatgtactggtatcgacaagacccaggactgggcctacagttgatctattactcctttgatgtcaaagatataaacaaaggagagatctctgatggatacagtgtctctcgacaggcacaggctaaattctccctgtccctagagtctgccatccccaaccagacagctctttacttctgtgccaccagtgatttg
|
| 46 |
+
Homosap TRBV25-1 TRBV25-1*01 L36092 F gaagctgacatctaccagaccccaagataccttgttatagggacaggaaagaagatcactctggaatgttctcaaaccatgggccatgacaaaatgtactggtatcaacaagatccaggaatggaactacacctcatccactattcctatggagttaattccacagagaagggagatctttcctctgagtcaacagtctccagaataaggacggagcattttcccctgaccctggagtctgccaggccctcacatacctctcagtacctctgtgccagcagtgaata
|
| 47 |
+
Homosap TRBV27 TRBV27*01 L36092 F gaagcccaagtgacccagaacccaagatacctcatcacagtgactggaaagaagttaacagtgacttgttctcagaatatgaaccatgagtatatgtcctggtatcgacaagacccagggctgggcttaaggcagatctactattcaatgaatgttgaggtgactgataagggagatgttcctgaagggtacaaagtctctcgaaaagagaagaggaatttccccctgatcctggagtcgcccagccccaaccagacctctctgtacttctgtgccagcagtttatc
|
| 48 |
+
Homosap TRBV28 TRBV28*01 U08314 F gatgtgaaagtaacccagagctcgagatatctagtcaaaaggacgggagagaaagtttttctggaatgtgtccaggatatggaccatgaaaatatgttctggtatcgacaagacccaggtctggggctacggctgatctatttctcatatgatgttaaaatgaaagaaaaaggagatattcctgaggggtacagtgtctctagagagaagaaggagcgcttctccctgattctggagtccgccagcaccaaccagacatctatgtacctctgtgccagcagtttatg
|
| 49 |
+
Homosap TRBV29-1 TRBV29-1*01 L36092 F agtgctgtcatctctcaaaagccaagcagggatatctgtcaacgtggaacctccctgacgatccagtgtcaagtcgatagccaagtcaccatgatgttctggtaccgtcagcaacctggacagagcctgacactgatcgcaactgcaaatcagggctctgaggccacatatgagagtggatttgtcattgacaagtttcccatcagccgcccaaacctaacattctcaactctgactgtgagcaacatgagccctgaagacagcagcatatatctctgcagcgttgaaga
|
| 50 |
+
Homosap TRBV29-1 TRBV29-1*02 M13847 (F) agtgctgtcatctctcaaaagccaagcagggatatctgtcaacgtggaacctccctgacgatccagtgtcaagtcgatagccaagtcaccatgatgttctggtaccgtcagcaacctggacagagcctgacactgatcgcaactgcaaatcagggctctgaggccacatatgagagtggatttgtcattgacaagtttcccatcagccgcccaaacctaacattctcaagtctgactgtgagcaacatgagccctgaagacagcagcatatatctctgcagcgttgaa
|
| 51 |
+
Homosap TRBV3-1 TRBV3-1*01 U07977 F gacacagctgtttcccagactccaaaatacctggtcacacagatgggaaacgacaagtccattaaatgtgaacaaaatctgggccatgatactatgtattggtataaacaggactctaagaaatttctgaagataatgtttagctacaataataaggagctcattataaatgaaacagttccaaatcgcttctcacctaaatctccagacaaagctcacttaaatcttcacatcaattccctggagcttggtgactctgctgtgtatttctgtgccagcagccaaga
|
| 52 |
+
Homosap TRBV3-1 TRBV3-1*02 L06889 (F) gacacagctgtttcccagactccaaaatacctggtcacacagatgggaaacgacaagtccattaaatgtgaacaaaatctgggccatgatactatgtattggtataaacaggactctaagaaatttctgaagataatgtttagctacaataacaaggagatcattataaatgaaacagttccaaatcgattctcacctaaatctccagacaaagctaaattaaatcttcacatcaattccctggagcttggtgactctgctgtgtatttctgtgccagc
|
| 53 |
+
Homosap TRBV30 TRBV30*01 L36092 F tctcagactattcatcaatggccagcgaccctggtgcagcctgtgggcagcccgctctctctggagtgcactgtggagggaacatcaaaccccaacctatactggtaccgacaggctgcaggcaggggcctccagctgctcttctactccgttggtattggccagatcagctctgaggtgccccagaatctctcagcctccagaccccaggaccggcagttcatcctgagttctaagaagctccttctcagtgactctggcttctatctctgtgcctggagtgt
|
| 54 |
+
Homosap TRBV30 TRBV30*02 Z13967 F tctcagactattcatcaatggccagcgaccctggtgcagcctgtgggcagcccgctctctctggagtgcactgtggagggaacatcaaaccccaacctatactggtaccgacaggctgcaggcaggggcctccagctgctcttctactccgttggtattggccagatcagctctgaggtgccccagaatctctcagcctccagaccccaggaccggcagttcatcctgagttctaagaagctcctcctcagtgactctggcttctatctctgtgcctggagtgt
|
| 55 |
+
Homosap TRBV30 TRBV30*05 L06893 (F) tctcagactattcatcaatggccagcgaccctggtgcagcctgtgggcagcccgctctccctggagtgcactgtggagggaacatcaaaccccaacctatactggtaccgacaggctgcaggacggggcctccagctgctcttctactccgttggtattggccagatcagctctgaggtgccccagaatctctcagcctccagaccccaggaccggcagttcatcctgagttctaagaagctccttctcagtgactctggcttctatctctgtgcctgggga
|
| 56 |
+
Homosap TRBV4-1 TRBV4-1*01 U07977 F gacactgaagttacccagacaccaaaacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatatggggcacagggctatgtattggtacaagcagaaagctaagaagccaccggagctcatgtttgtctacagctatgagaaactctctataaatgaaagtgtgccaagtcgcttctcacctgaatgccccaacagctctctcttaaaccttcacctacacgccctgcagccagaagactcagccctgtatctctgcgccagcagccaaga
|
| 57 |
+
Homosap TRBV4-2 TRBV4-2*01 U07975 F gaaacgggagttacgcagacaccaagacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatctggggcataacgctatgtattggtacaagcaaagtgctaagaagccactggagctcatgtttgtctacaactttaaagaacagactgaaaacaacagtgtgccaagtcgcttctcacctgaatgccccaacagctctcacttattccttcacctacacaccctgcagccagaagactcggccctgtatctctgtgccagcagccaaga
|
| 58 |
+
Homosap TRBV4-2 TRBV4-2*02 X58811 (F) gaaacgggagttacgcagacaccaagacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatctggggcataacgctatgtattggtacaagcaaagtgctaagaagccactggagctcatgtttgtctacaactttaaagaacagactgaaaacaacagtgtgccaagtcgcttctcacctgaatgccccaacagctctcacttatgccttcacctacacaccctgcagccagaagactcggccctgtatctctgtgccagcacc
|
| 59 |
+
Homosap TRBV4-3 TRBV4-3*01 U07978 F gaaacgggagttacgcagacaccaagacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatctgggtcataacgctatgtattggtacaagcaaagtgctaagaagccactggagctcatgtttgtctacagtcttgaagaacgggttgaaaacaacagtgtgccaagtcgcttctcacctgaatgccccaacagctctcacttattccttcacctacacaccctgcagccagaagactcggccctgtatctctgcgccagcagccaaga
|
| 60 |
+
Homosap TRBV4-3 TRBV4-3*02 X58812 (F) gaaacgggagttacgcagacaccaagacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatctgggtcataacgctatgtattggtacaagcaaagtgctaagaagccactggagctcatgtttgtctacagtcttgaagaacgggttgaaaacaacagtgtgccaagtcgcttctcacctgaatgccccaacagctctcacttatcccttcacctacacaccctgcagccagaagactcggccctgtatctctgcgccagcagc
|
| 61 |
+
Homosap TRBV4-3 TRBV4-3*03 L06888 (F) gaaacgggagttacgcagacaccaagacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatctgggtcataacgctatgtattggtacaagcaaagtgctaagaagccactggagctcatgtttgtctacagtcttgaagaacgtgttgaaaacaacagtgtgccaagtcgcttctcacctgaatgccccaacagctctcacttattccttcacctacacaccctgcagccagaagactcggccctgtatctctgcgccagcagc
|
| 62 |
+
Homosap TRBV5-1 TRBV5-1*01 L36092 F aaggctggagtcactcaaactccaagatatctgatcaaaacgagaggacagcaagtgacactgagctgctcccctatctctgggcataggagtgtatcctggtaccaacagaccccaggacagggccttcagttcctctttgaatacttcagtgagacacagagaaacaaaggaaacttccctggtcgattctcagggcgccagttctctaactctcgctctgagatgaatgtgagcaccttggagctgggggactcggccctttatctttgcgccagcagcttgg
|
| 63 |
+
Homosap TRBV5-1 TRBV5-1*02 M14271 (F) agggctggggtcactcaaactccaagacatctgatcaaaacgagaggacagcaagtgacactgggctgctcccctatctctgggcataggagtgtatcctggtaccaacagaccctaggacagggccttcagttcctctttgaatacttcagtgagacacagagaaacaaaggaaacttccttggtcgattctcagggcgccagttctctaactctcgctctgagatgaatgtgagcaccttggagctgggggactcggccctttatctttgcgccagc
|
| 64 |
+
Homosap TRBV5-4 TRBV5-4*01 L36092 F gagactggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcaagtgactctgagatgctcttctcagtctgggcacaacactgtgtcctggtaccaacaggccctgggtcaggggccccagtttatctttcagtattatagggaggaagagaatggcagaggaaacttccctcctagattctcaggtctccagttccctaattatagctctgagctgaatgtgaacgccttggagctggacgactcggccctgtatctctgtgccagcagcttgg
|
| 65 |
+
Homosap TRBV5-4 TRBV5-4*02 X57615 (F) gagactggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcaagtgactctgagatgctcttctcagtctgggcacaacactgtgtcctggtaccaacaggccctgggtcaggggccccagtttatctttcagtattatagggaggaagagaatggcagaggaaacttccctcctagattctcaggtctccagttccctaattataactctgagctgaatgtgaacgccttggagctggacgactcggccctgtatctctgtgccagcagc
|
| 66 |
+
Homosap TRBV5-5 TRBV5-5*01 L36092 F gacgctggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcaagtgactctgagatgctctcctatctctgggcacaagagtgtgtcctggtaccaacaggtcctgggtcaggggccccagtttatctttcagtattatgagaaagaagagagaggaagaggaaacttccctgatcgattctcagctcgccagttccctaactatagctctgagctgaatgtgaacgccttgttgctgggggactcggccctgtatctctgtgccagcagcttgg
|
| 67 |
+
Homosap TRBV5-5 TRBV5-5*02 X57611 (F) gacgctggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcacgtgactctgagatgctctcctatctctgggcacaagagtgtgtcctggtaccaacaggtcctgggtcaggggccccagtttatctttcagtattatgagaaagaagagagaggaagaggaaacttccctgatcgattctcagctcgccagttccctaactatagctctgagctgaatgtgaacgccttgttgctgggggactcggccctgtatctctgtgccagcagc
|
| 68 |
+
Homosap TRBV5-5 TRBV5-5*03 X58801 (F) gacgctggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcaagtgactctgagatgctctcctatctctgagcacaagagtgtgtcctggtaccaacaggtcctgggtcaggggccccagtttatctttcagtattatgagaaagaagagagaggaagaggaaacttccctgatcgattctcagctcgccagttccctaactatagctctgagctgaatgtgaacgccttgttgctgggggactcggccctgtatctctgtgccagcagc
|
| 69 |
+
Homosap TRBV5-6 TRBV5-6*01 L36092 F gacgctggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcaagtgactctgagatgctctcctaagtctgggcatgacactgtgtcctggtaccaacaggccctgggtcaggggccccagtttatctttcagtattatgaggaggaagagagacagagaggcaacttccctgatcgattctcaggtcaccagttccctaactatagctctgagctgaatgtgaacgccttgttgctgggggactcggccctctatctctgtgccagcagcttgg
|
| 70 |
+
Homosap TRBV5-8 TRBV5-8*01 L36092 F gaggctggagtcacacaaagtcccacacacctgatcaaaacgagaggacagcaagcgactctgagatgctctcctatctctgggcacaccagtgtgtactggtaccaacaggccctgggtctgggcctccagttcctcctttggtatgacgagggtgaagagagaaacagaggaaacttccctcctagattttcaggtcgccagttccctaattatagctctgagctgaatgtgaacgccttggagctggaggactcggccctgtatctctgtgccagcagcttgg
|
| 71 |
+
Homosap TRBV6-1 TRBV6-1*01 X61446 F aatgctggtgtcactcagaccccaaaattccaggtcctgaagacaggacagagcatgacactgcagtgtgcccaggatatgaaccataactccatgtactggtatcgacaagacccaggcatgggactgaggctgatttattactcagcttctgagggtaccactgacaaaggagaagtccccaatggctacaatgtctccagattaaacaaacgggagttctcgctcaggctggagtcggctgctccctcccagacatctgtgtacttctgtgccagcagtgaagc
|
| 72 |
+
Homosap TRBV6-2 TRBV6-2*01 X61445 F aatgctggtgtcactcagaccccaaaattccgggtcctgaagacaggacagagcatgacactgctgtgtgcccaggatatgaaccatgaatacatgtactggtatcgacaagacccaggcatggggctgaggctgattcattactcagttggtgagggtacaactgccaaaggagaggtccctgatggctacaatgtctccagattaaaaaaacagaatttcctgctggggttggagtcggctgctccctcccaaacatctgtgtacttctgtgccagcagttactc
|
| 73 |
+
Homosap TRBV6-3 TRBV6-3*01 U07978 F aatgctggtgtcactcagaccccaaaattccgggtcctgaagacaggacagagcatgacactgctgtgtgcccaggatatgaaccatgaatacatgtactggtatcgacaagacccaggcatggggctgaggctgattcattactcagttggtgagggtacaactgccaaaggagaggtccctgatggctacaatgtctccagattaaaaaaacagaatttcctgctggggttggagtcggctgctccctcccaaacatctgtgtacttctgtgccagcagttactc
|
| 74 |
+
Homosap TRBV6-4 TRBV6-4*01 X61653 F attgctgggatcacccaggcaccaacatctcagatcctggcagcaggacggcgcatgacactgagatgtacccaggatatgagacataatgccatgtactggtatagacaagatctaggactggggctaaggctcatccattattcaaatactgcaggtaccactggcaaaggagaagtccctgatggttatagtgtctccagagcaaacacagatgatttccccctcacgttggcgtctgctgtaccctctcagacatctgtgtacttctgtgccagcagtgactc
|
| 75 |
+
Homosap TRBV6-4 TRBV6-4*02 AF009660 F actgctgggatcacccaggcaccaacatctcagatcctggcagcaggacggagcatgacactgagatgtacccaggatatgagacataatgccatgtactggtatagacaagatctaggactggggctaaggctcatccattattcaaatactgcaggtaccactggcaaaggagaagtccctgatggttatagtgtctccagagcaaacacagatgatttccccctcacgttggcgtctgctgtaccctctcagacatctgtgtacttctgtgccagcagtgactc
|
| 76 |
+
Homosap TRBV6-5 TRBV6-5*01 L36092 F aatgctggtgtcactcagaccccaaaattccaggtcctgaagacaggacagagcatgacactgcagtgtgcccaggatatgaaccatgaatacatgtcctggtatcgacaagacccaggcatggggctgaggctgattcattactcagttggtgctggtatcactgaccaaggagaagtccccaatggctacaatgtctccagatcaaccacagaggatttcccgctcaggctgctgtcggctgctccctcccagacatctgtgtacttctgtgccagcagttactc
|
| 77 |
+
Homosap TRBV6-6 TRBV6-6*01 L36092 F aatgctggtgtcactcagaccccaaaattccgcatcctgaagataggacagagcatgacactgcagtgtacccaggatatgaaccataactacatgtactggtatcgacaagacccaggcatggggctgaagctgatttattattcagttggtgctggtatcactgataaaggagaagtcccgaatggctacaacgtctccagatcaaccacagaggatttcccgctcaggctggagttggctgctccctcccagacatctgtgtacttctgtgccagcagttactc
|
| 78 |
+
Homosap TRBV6-6 TRBV6-6*02 AF009662 F aatgctggtgtcactcagaccccaaaattccgcatcctgaagataggacagagcatgacactgcagtgtgcccaggatatgaaccataactacatgtactggtatcgacaagacccaggcatggggctgaagctgatttattattcagttggtgctggtatcactgacaaaggagaagtcccgaatggctacaacgtctccagatcaaccacagaggatttcccgctcaggctggagttggctgctccctcccagacatctgtgtacttctgtgccagcagttactc
|
| 79 |
+
Homosap TRBV6-6 TRBV6-6*03 X58815 (F) aatgctggtgtcactcagaccccaaaattccgcatcctgaagataggacagagcatgacactgcagtgtgcccaggatatgaaccataactacatgtactggtatcgacaagacccaggcatggggctgaagctgatttattattcagttggtgctggtatcactgataaaggagaagtcccgaatggctacaacgtctccagatcaaccacagaggatttcccgctcaggctggagttggctgctccctcccagacatctgtgtacttctgtgccagcagt
|
| 80 |
+
Homosap TRBV6-6 TRBV6-6*04 X74848 (F) aatgctggtgtcactcagaccccaaaattccgcatcctgaagataggacagagcatgacactgcagtgtacccaggatatgaaccatgaatacatgtactggtatcgacaagacccaggcatggggctgaagctgatttattattcagttggtgctggtatcactgataaaggagaagtcccgaatggctacaatgtctccagatcaaccacagaggatttcccgctcaggctggagttggctgctccctcccagacatctgtgtacttctgtgccagcagtcga
|
| 81 |
+
Homosap TRBV6-6 TRBV6-6*05 L06892 (F) aatgctggtgtcactcagaccccaaaattccgcatcctgaagataggacagagcatgacactgcagtgtgcccaggatatgaaccataactacatgtactggtatcgacaagacccaggcatggggctgaagctgatttattattcagttggtgctggtatcactgacaaaggagaagtcccgaatggctacaacgtctccagatcaaccacagaggatttcccgctcaggctggagttggctgctgcctcccagacatctgtgtacttctgtgccagcagc
|
| 82 |
+
Homosap TRBV6-8 TRBV6-8*01 L36092 F aatgctggtgtcactcagaccccaaaattccacatcctgaagacaggacagagcatgacactgcagtgtgcccaggatatgaaccatggatacatgtcctggtatcgacaagacccaggcatggggctgagactgatttactactcagctgctgctggtactactgacaaagaagtccccaatggctacaatgtctctagattaaacacagaggatttcccactcaggctggtgtcggctgctccctcccagacatctgtgtacttgtgtgccagcagttactc
|
| 83 |
+
Homosap TRBV6-9 TRBV6-9*01 X61447 F aatgctggtgtcactcagaccccaaaattccacatcctgaagacaggacagagcatgacactgcagtgtgcccaggatatgaaccatggatacttgtcctggtatcgacaagacccaggcatggggctgaggcgcattcattactcagttgctgctggtatcactgacaaaggagaagtccccgatggctacaatgtatccagatcaaacacagaggatttcccgctcaggctggagtcagctgctccctcccagacatctgtatacttctgtgccagcagttattc
|
| 84 |
+
Homosap TRBV7-2 TRBV7-2*01 X61442 F ggagctggagtctcccagtcccccagtaacaaggtcacagagaagggaaaggatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacagagcctggggcagggcctggagtttttaatttacttccaaggcaacagtgcaccagacaaatcagggctgcccagtgatcgcttctctgcagagaggactgggggatccgtctccactctgacgatccagcgcacacagcaggaggactcggccgtgtatctctgtgccagcagcttagc
|
| 85 |
+
Homosap TRBV7-2 TRBV7-2*02 L36190 F ggagctggagtctcccagtcccccagtaacaaggtcacagagaagggaaaggatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacagaggctggggcagggcctggagtttttaatttacttccaaggcaacagtgcaccagacaaatcagggctgcccagtgatcgcttctctgcagagaggactggggaatccgtctccactctgacgatccagcgcacacagcaggaggactcggccgtgtatctctgtgccagcagcttagc
|
| 86 |
+
Homosap TRBV7-2 TRBV7-2*03 U07975 F ggagctggagtctcccagtcccccagtaacaaggtcacagagaagggaaaggatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacagaggctggggcagggcctggagtttttaatttacttccaaggcaacagtgcaccagacaaatcagggctgcccagtgatcgcttctctgcagagaggactggggaatccgtctccactctgacgatccagcgcacacagcaggaggactcggccgtgtatctctgtaccagcagcttagc
|
| 87 |
+
Homosap TRBV7-2 TRBV7-2*04 M27387 (F) ggagctggagtttcccagtcccccagtaacaaggtcacagagaagggaaaggatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacagagcctggggcagggcctggagtttttaatttacttccaaggcaacagtgcaccagacaaatcagggctgcccagtgatcgcttctctgcagagaggactgggggatccgtctccactctgacgatccagcgcacacagcaggaggactcggccgtgtatctctgtgccagcagcttag
|
| 88 |
+
Homosap TRBV7-3 TRBV7-3*01 X61440 F ggtgctggagtctcccagacccccagtaacaaggtcacagagaagggaaaatatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacaaagcctggggcagggcccagagtttctaatttacttccaaggcacgggtgcggcagatgactcagggctgcccaacgatcggttctttgcagtcaggcctgagggatccgtctctactctgaagatccagcgcacagagcggggggactcagccgtgtatctctgtgccagcagcttaac
|
| 89 |
+
Homosap TRBV7-3 TRBV7-3*04 X74843 (F) ggtgctggagtctcccagacccccagtaacaaggtcacagagaagggaaaatatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacaaagcctggggcagggcccagagtttctaatttacttccaaggcacgggtgcggcagatgactcagggctgcccaacgatcggttctttgcagtcaggcctgagggatccgtctctactctgaagatccagcgcacagagcggggggactctgccgtgtatctctgtgccagcagctt
|
| 90 |
+
Homosap TRBV7-4 TRBV7-4*01 L36092 F ggtgctggagtctcccagtccccaaggtacaaagtcgcaaagaggggacgggatgtagctctcaggtgtgattcaatttcgggtcatgtaaccctttattggtaccgacagaccctggggcagggctcagaggttctgacttactcccagagtgatgctcaacgagacaaatcagggcggcccagtggtcggttctctgcagagaggcctgagagatccgtctccactctgaagatccagcgcacagagcagggggactcagctgtgtatctctgtgccagcagcttagc
|
| 91 |
+
Homosap TRBV7-6 TRBV7-6*01 L36092 F ggtgctggagtctcccagtctcccaggtacaaagtcacaaagaggggacaggatgtagctctcaggtgtgatccaatttcgggtcatgtatccctttattggtaccgacaggccctggggcagggcccagagtttctgacttacttcaattatgaagcccaacaagacaaatcagggctgcccaatgatcggttctctgcagagaggcctgagggatccatctccactctgacgatccagcgcacagagcagcgggactcggccatgtatcgctgtgccagcagcttagc
|
| 92 |
+
Homosap TRBV7-6 TRBV7-6*02 X58806 (F) ggtgctggagtctcccagtctcccaggtacaaagtcacaaagaggggacaggatgtagctctcaggtgtgatccaatctcgggtcatgtatccctttattggtaccgacaggccctggggcagggcccagagtttctgacttacttcaattatgaagcccaacaagacaaatcagggctgcccaatgatcggttctctgcagagaggcctgagggatccatctccactctgacgatccagcgcacagagcagcgggactcggccatgtatcgctgtgccagcagc
|
| 93 |
+
Homosap TRBV7-7 TRBV7-7*01 L36092 F ggtgctggagtctcccagtctcccaggtacaaagtcacaaagaggggacaggatgtaactctcaggtgtgatccaatttcgagtcatgcaaccctttattggtatcaacaggccctggggcagggcccagagtttctgacttacttcaattatgaagctcaaccagacaaatcagggctgcccagtgatcggttctctgcagagaggcctgagggatccatctccactctgacgattcagcgcacagagcagcgggactcagccatgtatcgctgtgccagcagcttagc
|
| 94 |
+
Homosap TRBV7-7 TRBV7-7*02 X57607 (F) ggtgctggagtctcccagtctcccaggtacaaagtcacaaagaggggacaggatgtaactctcaggtgtgatccaatttcgagtcatgtaaccctttattggtatcaacaggccctggggcagggcccagagtttctgacttacttcaattatgaagctcaaccagacaaatcagggctgcccagtgatcggttctctgcagagaggcctgagggatccatctccactctgacgattcagcgcacagagcagcgggactcagccatgtatcgctgtgccagcagc
|
| 95 |
+
Homosap TRBV7-8 TRBV7-8*01 M11953 F ggtgctggagtctcccagtcccctaggtacaaagtcgcaaagagaggacaggatgtagctctcaggtgtgatccaatttcgggtcatgtatcccttttttggtaccaacaggccctggggcaggggccagagtttctgacttatttccagaatgaagctcaactagacaaatcggggctgcccagtgatcgcttctttgcagaaaggcctgagggatccgtctccactctgaagatccagcgcacacagcaggaggactccgccgtgtatctctgtgccagcagcttagc
|
| 96 |
+
Homosap TRBV7-8 TRBV7-8*02 X61441 F ggtgctggagtctcccagtcccctaggtacaaagtcgcaaagagaggacaggatgtagctctcaggtgtgatccaatttcgggtcatgtatcccttttttggtaccaacaggccctggggcaggggccagagtttctgacttatttccagaatgaagctcaactagacaaatcggggctgcccagtgatcgcttctttgcagaaaggcctgagggatccgtctccactctgaagatccagcgcacacagaaggaggactccgccgtgtatctctgtgccagcagcttagc
|
| 97 |
+
Homosap TRBV7-8 TRBV7-8*03 M27384 (F) ggtgctggagtctcccagtcccctaggtacaaagtcgcaaagagaggacaggatgtagctctcaggtgtgatccaatttcgggtcatgtatcccttttttggtaccaacaggccctcgggcaggggccagagtttctgacttatttccagaatgaagctcaactagacaaatcggggctgcccagtgatcgcttctttgcagaaaggcctgagggatccgtctccactctgaagatccagcgcacacagcaggaggactccgccgtgtatctctgtgccagcagccga
|
| 98 |
+
Homosap TRBV7-9 TRBV7-9*01 L36092 F gatactggagtctcccagaaccccagacacaagatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaccctggggcagggcccagagtttctgacttacttccagaatgaagctcaactagaaaaatcaaggctgctcagtgatcggttctctgcagagaggcctaagggatctttctccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcagcttagc
|
| 99 |
+
Homosap TRBV7-9 TRBV7-9*02 M15564 (F) gatactggagtctcccagaaccccagacacaacatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaccctggggcagggcccagagtttctgacttacttccagaatgaagctcaactagaaaaatcaaggctgctcagtgatcggttctctgcagagaggcctaagggatctttctccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcagcttagc
|
| 100 |
+
Homosap TRBV7-9 TRBV7-9*03 AF009663 F gatactggagtctcccaggaccccagacacaagatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaccctggggcagggcccagagtttctgacttacttccagaatgaagctcaactagaaaaatcaaggctgctcagtgatcggttctctgcagagaggcctaagggatctttctccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcagcttagc
|
| 101 |
+
Homosap TRBV7-9 TRBV7-9*04 M14261 (F) atatctggagtctcccacaaccccagacacaagatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaaccctgggcagggcccagagtttctgacttacttccagaatgaagctcaactggaaaaatcagggctgctcagtgatcggatctctgcagagaggcctaagggatctttctccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcagc
|
| 102 |
+
Homosap TRBV7-9 TRBV7-9*05 M27385 (F) gatactggagtctcccagaaccccagacacaagatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaccctggggcagggcccagagtttctgacttacttccagaatgaagctcaactagaaaaatcaaggctgctcagtgatcggttctctgcagagaggcctaagggatctctctccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcaccaaa
|
| 103 |
+
Homosap TRBV7-9 TRBV7-9*06 X74844 (F) gatactggagtctcccagaaccccagacacaagatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaccctggggcagggcccagagtttctgacttacttccagaatgaagctcaactagaaaaatcaaggctgctcagtgatcggttctctgcagagaggcctaagggatctctttccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcacgttg
|
| 104 |
+
Homosap TRBV9 TRBV9*01 L36092 F gattctggagtcacacaaaccccaaagcacctgatcacagcaactggacagcgagtgacgctgagatgctcccctaggtctggagacctctctgtgtactggtaccaacagagcctggaccagggcctccagttcctcattcagtattataatggagaagagagagcaaaaggaaacattcttgaacgattctccgcacaacagttccctgacttgcactctgaactaaacctgagctctctggagctgggggactcagctttgtatttctgtgccagcagcgtag
|
| 105 |
+
Homosap TRBV9 TRBV9*02 AF009660 F gattctggagtcacacaaaccccaaagcacctgatcacagcaactggacagcgagtgacgctgagatgctcccctaggtctggagacctctctgtgtactggtaccaacagagcctggaccagggcctccagttcctcattcactattataatggagaagagagagcaaaaggaaacattcttgaacgattctccgcacaacagttccctgacttgcactctgaactaaacctgagctctctggagctgggggactcagctttgtatttctgtgccagcagcgtag
|
| 106 |
+
Homosap TRBV9 TRBV9*03 M27380 (F) gattctggagtcacacaaaccccaaagcacctgatcacagcaactggacagcgagtgacgctgagatgctcccctaggtctggagacctctctgtgtactggtaccaacagagcctggaccagggcctccagttcctcattcaatattataatggagaagagagagcaaaaggaaacattcttgaacgattctccgcacaacagttccctgacttgcactctgaactaaacctgagctctctggagctgggggactcagctttgtatttctgtgccagcagc
|
src/main.py
ADDED
|
@@ -0,0 +1,1423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from typing import Dict, List, Tuple, Optional
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
from collections import Counter
|
| 9 |
+
from sklearn.metrics import confusion_matrix, roc_auc_score, average_precision_score
|
| 10 |
+
import warnings
|
| 11 |
+
from model import negative_sampling_phla
|
| 12 |
+
warnings.filterwarnings("ignore")
|
| 13 |
+
|
| 14 |
+
from physicochemical import PhysicochemicalEncoder
|
| 15 |
+
|
| 16 |
+
from model import (
|
| 17 |
+
ESM2Encoder,
|
| 18 |
+
ESMFoldEncoder,
|
| 19 |
+
PeptideHLABindingPredictor,
|
| 20 |
+
PepHLA_Dataset,
|
| 21 |
+
peptide_hla_collate_fn,
|
| 22 |
+
TCRPeptideHLABindingPredictor,
|
| 23 |
+
TCRPepHLA_Dataset,
|
| 24 |
+
tcr_pep_hla_collate_fn,
|
| 25 |
+
EarlyStopping
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
# ============================================================================
|
| 29 |
+
# Utility functions
|
| 30 |
+
# ============================================================================
|
| 31 |
+
|
| 32 |
+
def load_train_data(
    df_train_list: List[pd.DataFrame],
    df_val_list: List[pd.DataFrame],
    hla_dict_path: str = 'pMHC/HLA_dict.npy',
) -> Tuple[List[pd.DataFrame], List[pd.DataFrame]]:
    """
    Preprocess training and validation datasets in place.

    For every DataFrame, the 'HLA-' prefix is stripped from the 'HLA'
    column and an 'HLA_full' column holding the full HLA protein sequence
    (looked up in the HLA dictionary) is added.

    Args:
        df_train_list: Training DataFrames, each with an 'HLA' column
        df_val_list: Validation DataFrames, each with an 'HLA' column
        hla_dict_path: Path to the HLA name -> full sequence dictionary (.npy)

    Returns:
        (df_train_list, df_val_list) with 'HLA' normalized and 'HLA_full' added

    Raises:
        KeyError: If an HLA name is missing from the dictionary.
    """
    print("Loading training and validation data...")

    # HLA name -> full amino-acid sequence mapping (pickled dict in a .npy file)
    HLA_dict = np.load(hla_dict_path, allow_pickle=True).item()

    # NOTE: DataFrames are modified in place (callers may rely on this).
    for df in df_train_list + df_val_list:
        df['HLA'] = df['HLA'].apply(lambda x: x[4:] if x.startswith('HLA-') else x)
        df['HLA_full'] = df['HLA'].apply(lambda x: HLA_dict[x])

    return df_train_list, df_val_list
|
| 61 |
+
|
| 62 |
+
def load_test_data(
    df_test: pd.DataFrame,
    hla_dict_path: str = 'pMHC/HLA_dict.npy'
) -> pd.DataFrame:
    """
    Preprocess a test DataFrame (e.g. an independent or external set).

    Works on a copy: strips the 'HLA-' prefix from the 'HLA' column and
    adds an 'HLA_full' column with the full HLA protein sequence.

    Args:
        df_test: Test dataframe with at least 'HLA', 'peptide', 'label'
        hla_dict_path: Path to HLA dictionary (maps HLA name to full sequence)

    Returns:
        Processed copy of df_test with 'HLA_full' added
    """
    print("Processing test data...")

    # Name -> full protein sequence lookup table.
    hla_lookup = np.load(hla_dict_path, allow_pickle=True).item()

    def _strip_prefix(name: str) -> str:
        return name[4:] if name.startswith('HLA-') else name

    processed = df_test.copy()
    processed['HLA'] = processed['HLA'].apply(_strip_prefix)
    processed['HLA_full'] = processed['HLA'].apply(lambda name: hla_lookup[name])

    print(f"✓ Test set: {len(processed)} samples")
    return processed
|
| 86 |
+
|
| 87 |
+
class StriMap_pHLA:
|
| 88 |
+
"""
|
| 89 |
+
StriMap for Structure-informed Peptide-HLA Binding Prediction Model
|
| 90 |
+
"""
|
| 91 |
+
|
| 92 |
+
def __init__(
|
| 93 |
+
self,
|
| 94 |
+
device: str = 'cuda:0',
|
| 95 |
+
model_save_path: str = 'model_params/best_model_phla.pt',
|
| 96 |
+
pep_dim: int = 256,
|
| 97 |
+
hla_dim: int = 256,
|
| 98 |
+
bilinear_dim: int = 256,
|
| 99 |
+
loss_fn: str = 'bce',
|
| 100 |
+
alpha: float = 0.5,
|
| 101 |
+
gamma: float = 2.0,
|
| 102 |
+
esm2_layer: int = 33,
|
| 103 |
+
batch_size: int = 256,
|
| 104 |
+
esmfold_cache_dir: str = "esm_cache",
|
| 105 |
+
cache_dir: str = 'phla_cache',
|
| 106 |
+
cache_save: bool = True,
|
| 107 |
+
seed: int = 1,
|
| 108 |
+
pos_weights: Optional[float] = None
|
| 109 |
+
):
|
| 110 |
+
"""
|
| 111 |
+
Initialize StriMap model
|
| 112 |
+
|
| 113 |
+
Args:
|
| 114 |
+
device: Device for computation
|
| 115 |
+
cache_dir: Directory for caching embeddings
|
| 116 |
+
model_save_path: Path to save best model
|
| 117 |
+
pep_dim: Peptide embedding dimension
|
| 118 |
+
hla_dim: HLA embedding dimension
|
| 119 |
+
bilinear_dim: Bilinear attention dimension
|
| 120 |
+
loss_fn: Loss function ('bce' or 'focal')
|
| 121 |
+
alpha: Alpha parameter for focal loss
|
| 122 |
+
gamma: Gamma parameter for focal loss
|
| 123 |
+
esm2_layer: ESM2 layer to extract features from
|
| 124 |
+
esmfold_cache_dir: Cache directory for ESMFold
|
| 125 |
+
cache_dir: Directory for caching embeddings
|
| 126 |
+
seed: Random seed
|
| 127 |
+
"""
|
| 128 |
+
self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
|
| 129 |
+
self.model_save_path = model_save_path
|
| 130 |
+
if not os.path.exists(os.path.dirname(model_save_path)) and os.path.dirname(model_save_path) != '':
|
| 131 |
+
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
|
| 132 |
+
self.seed = seed
|
| 133 |
+
self.cache_save = cache_save
|
| 134 |
+
self.batch_size = batch_size
|
| 135 |
+
self.loss_fn_name = loss_fn
|
| 136 |
+
self.alpha = alpha
|
| 137 |
+
self.gamma = gamma
|
| 138 |
+
self.pos_weights = pos_weights
|
| 139 |
+
|
| 140 |
+
# Set random seeds
|
| 141 |
+
self._set_seed(seed)
|
| 142 |
+
|
| 143 |
+
# Initialize encoders
|
| 144 |
+
print("Initializing encoders...")
|
| 145 |
+
self.phys_encoder = PhysicochemicalEncoder(device=self.device)
|
| 146 |
+
self.esm2_encoder = ESM2Encoder(device=str(self.device), layer=esm2_layer, cache_dir=cache_dir)
|
| 147 |
+
self.esmfold_encoder = ESMFoldEncoder(esm_cache_dir=esmfold_cache_dir, cache_dir=cache_dir)
|
| 148 |
+
|
| 149 |
+
# Initialize model
|
| 150 |
+
print("Initializing binding prediction model...")
|
| 151 |
+
self.model = PeptideHLABindingPredictor(
|
| 152 |
+
pep_dim=pep_dim,
|
| 153 |
+
hla_dim=hla_dim,
|
| 154 |
+
bilinear_dim=bilinear_dim,
|
| 155 |
+
loss_fn=self.loss_fn_name,
|
| 156 |
+
alpha=self.alpha,
|
| 157 |
+
gamma=self.gamma,
|
| 158 |
+
device=str(self.device),
|
| 159 |
+
pos_weights=self.pos_weights
|
| 160 |
+
).to(self.device)
|
| 161 |
+
|
| 162 |
+
# Embeddings cache
|
| 163 |
+
self.phys_dict = None
|
| 164 |
+
self.esm2_dict = None
|
| 165 |
+
self.struct_dict = None
|
| 166 |
+
|
| 167 |
+
print(f"✓ StriMap initialized on {self.device}")
|
| 168 |
+
|
| 169 |
+
def _set_seed(self, seed: int):
|
| 170 |
+
"""Set random seeds for reproducibility"""
|
| 171 |
+
np.random.seed(seed)
|
| 172 |
+
torch.manual_seed(seed)
|
| 173 |
+
torch.cuda.manual_seed(seed)
|
| 174 |
+
torch.cuda.manual_seed_all(seed)
|
| 175 |
+
torch.backends.cudnn.benchmark = False
|
| 176 |
+
torch.backends.cudnn.deterministic = True
|
| 177 |
+
|
| 178 |
+
def prepare_embeddings(
|
| 179 |
+
self,
|
| 180 |
+
df: pd.DataFrame,
|
| 181 |
+
force_recompute: bool = False,
|
| 182 |
+
):
|
| 183 |
+
"""
|
| 184 |
+
Prepare all embeddings (physicochemical, ESM2, structure)
|
| 185 |
+
|
| 186 |
+
Args:
|
| 187 |
+
df: DataFrame containing 'peptide' and 'HLA_full' columns
|
| 188 |
+
force_recompute: Force recomputation even if cache exists
|
| 189 |
+
incremental: If True, only compute missing sequences
|
| 190 |
+
phys_cache: Physicochemical embeddings cache file
|
| 191 |
+
esm2_cache: ESM2 embeddings cache file
|
| 192 |
+
struct_cache: Structure embeddings cache file
|
| 193 |
+
"""
|
| 194 |
+
|
| 195 |
+
# Extract unique sequences
|
| 196 |
+
all_peptides = sorted(set(df['peptide'].astype(str)))
|
| 197 |
+
all_hlas = sorted(set(df['HLA_full'].astype(str)))
|
| 198 |
+
|
| 199 |
+
print(f"\n{'='*70}")
|
| 200 |
+
print(f"Preparing embeddings for:")
|
| 201 |
+
print(f" - {len(all_peptides)} unique peptides")
|
| 202 |
+
print(f" - {len(all_hlas)} unique HLAs")
|
| 203 |
+
print(f"{'='*70}\n")
|
| 204 |
+
|
| 205 |
+
# ========================================================================
|
| 206 |
+
# 1. Physicochemical features
|
| 207 |
+
# ========================================================================
|
| 208 |
+
self.phys_dict = {
|
| 209 |
+
'pep': self._encode_phys(all_peptides),
|
| 210 |
+
'hla': self._encode_phys(all_hlas)
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
# ========================================================================
|
| 214 |
+
# 2. ESM2 embeddings
|
| 215 |
+
# ========================================================================
|
| 216 |
+
self.esm2_dict = {
|
| 217 |
+
'pep': self._encode_esm2(all_peptides, prefix='pep', re_embed=force_recompute),
|
| 218 |
+
'hla': self._encode_esm2(all_hlas, prefix='hla', re_embed=force_recompute)
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
# ========================================================================
|
| 222 |
+
# 3. Structure features (only for HLA)
|
| 223 |
+
# ========================================================================
|
| 224 |
+
self.struct_dict = self._encode_structure(all_hlas)
|
| 225 |
+
|
| 226 |
+
# ========================================================================
|
| 227 |
+
# Summary
|
| 228 |
+
# ========================================================================
|
| 229 |
+
print(f"{'='*70}")
|
| 230 |
+
print("✓ All embeddings prepared!")
|
| 231 |
+
print(f" - Phys: {len(self.phys_dict['pep'])} peptides, {len(self.phys_dict['hla'])} HLAs")
|
| 232 |
+
print(f" - ESM2: {len(self.esm2_dict['pep'])} peptides, {len(self.esm2_dict['hla'])} HLAs")
|
| 233 |
+
print(f" - Struct: {len(self.struct_dict)} HLAs")
|
| 234 |
+
print(f"{'='*70}\n")
|
| 235 |
+
|
| 236 |
+
def _encode_phys(self,
|
| 237 |
+
sequences: List[str]) -> Dict[str, torch.Tensor]:
|
| 238 |
+
"""Encode physicochemical properties"""
|
| 239 |
+
emb_dict = {}
|
| 240 |
+
|
| 241 |
+
for i in tqdm(range(0, len(sequences), self.batch_size), desc="Phys encoding"):
|
| 242 |
+
batch = sequences[i:i+self.batch_size]
|
| 243 |
+
embs = self.phys_encoder(batch).cpu() # [B, L, D]
|
| 244 |
+
for seq, emb in zip(batch, embs):
|
| 245 |
+
emb_dict[seq] = emb
|
| 246 |
+
|
| 247 |
+
return emb_dict
|
| 248 |
+
|
| 249 |
+
def _encode_esm2(self, sequences: List[str], prefix: str, re_embed: bool = False) -> Dict[str, torch.Tensor]:
|
| 250 |
+
"""Encode with ESM2"""
|
| 251 |
+
df_tmp = pd.DataFrame({'seq': sequences})
|
| 252 |
+
emb_dict = self.esm2_encoder.forward(
|
| 253 |
+
df_tmp,
|
| 254 |
+
seq_col='seq',
|
| 255 |
+
prefix=prefix,
|
| 256 |
+
batch_size=self.batch_size,
|
| 257 |
+
re_embed=re_embed,
|
| 258 |
+
cache_save=self.cache_save
|
| 259 |
+
)
|
| 260 |
+
return emb_dict
|
| 261 |
+
|
| 262 |
+
def _encode_structure(self, sequences: List[str], re_embed: bool = False) -> Dict[str, Tuple]:
|
| 263 |
+
"""Encode structure with ESMFold"""
|
| 264 |
+
feat_list, coor_list = self.esmfold_encoder.forward(
|
| 265 |
+
pd.DataFrame({'hla': sequences}),
|
| 266 |
+
'hla',
|
| 267 |
+
device=str(self.device),
|
| 268 |
+
re_embed=re_embed,
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
struct_dict = {
|
| 272 |
+
seq: (feat, coor)
|
| 273 |
+
for seq, feat, coor in zip(sequences, feat_list, coor_list)
|
| 274 |
+
}
|
| 275 |
+
return struct_dict
|
| 276 |
+
|
| 277 |
+
def train(
    self,
    df_train: pd.DataFrame,
    df_val: pd.DataFrame,
    epochs: int = 100,
    batch_size: int = 256,
    lr: float = 1e-4,
    patience: int = 5,
    num_workers: int = 8,
    fold_id: Optional[int] = None
) -> Dict[str, List[float]]:
    """
    Train the model

    Args:
        df_train: Training data
        df_val: Validation data
        epochs: Number of epochs
        batch_size: Batch size
        lr: Learning rate
        patience: Early stopping patience
        num_workers: Number of data loading workers
        fold_id: Fold identifier for saving (None for single model)

    Returns:
        Dictionary with training history; keys 'train_loss', 'val_loss',
        'val_auc', 'val_prc', one entry per completed epoch.
    """
    # Check if embeddings are prepared
    if self.phys_dict is None or self.esm2_dict is None or self.struct_dict is None:
        raise ValueError("Embeddings not prepared! Call prepare_embeddings() first.")

    # Create datasets backed by the precomputed embedding caches
    print("Creating datasets...")
    train_dataset = PepHLA_Dataset(df_train, self.phys_dict, self.esm2_dict, self.struct_dict)
    val_dataset = PepHLA_Dataset(df_val, self.phys_dict, self.esm2_dict, self.struct_dict)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        collate_fn=peptide_hla_collate_fn,
        pin_memory=True
    )

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=peptide_hla_collate_fn,
        pin_memory=True
    )

    # Optimizer and early stopping
    optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)

    # Model save path for this fold (suffix "_fold{N}" inserted before ".pt")
    save_path = self.model_save_path if fold_id is None else \
                self.model_save_path.replace('.pt', f'_fold{fold_id}.pt')

    early_stopping = EarlyStopping(
        patience=patience,
        save_path=save_path
    )

    # Training history accumulated per epoch
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_auc': [],
        'val_prc': []
    }

    fold_str = f"Fold {fold_id}" if fold_id is not None else "Single model"
    print(f"\nStarting training for {epochs} epochs [{fold_str}]...")
    print("=" * 70)

    for epoch in range(epochs):
        # ---- Training phase ----
        self.model.train()
        train_loss = 0.0
        train_batches = 0

        train_iter = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]", leave=False, ncols=80)

        for batch in train_iter:
            optimizer.zero_grad()
            # Model forward is assumed to return (probs, loss, attn, aux)
            # — same 4-tuple unpacking as in _predict_single.
            probs, loss, _, _ = self.model(batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_batches += 1

        train_loss /= train_batches

        # ---- Validation phase (no gradients) ----
        self.model.eval()
        val_loss = 0.0
        val_preds = []
        val_labels = []
        val_batches = 0

        with torch.no_grad():
            val_iter = tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]", leave=False, ncols=80)
            for batch in val_iter:
                probs, loss, _, _ = self.model(batch)
                val_loss += loss.item()
                val_batches += 1
                # NOTE(review): extend() on a tensor yields per-sample 0-d tensors;
                # sklearn converts them, but .tolist() would be cleaner — confirm.
                val_preds.extend(probs)
                val_labels.extend(batch['label'])

        val_auc = roc_auc_score(val_labels, val_preds)
        val_loss /= val_batches
        val_prc = average_precision_score(val_labels, val_preds)

        # Record history
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_auc'].append(val_auc)
        history['val_prc'].append(val_prc)

        # Print metrics
        print(f"[{fold_str}] Epoch [{epoch+1}/{epochs}] | "
              f"Train Loss: {train_loss:.4f} | "
              f"Val Loss: {val_loss:.4f} | "
              f"Val AUC: {val_auc:.4f} | "
              f"Val PRC: {val_prc:.4f}")
        # Early stopping is driven by validation PRC (higher is better),
        # not validation loss — EarlyStopping is assumed to checkpoint on
        # improvement of the passed score; confirm its comparison direction.
        early_stopping(val_prc, self.model)

        if early_stopping.early_stop:
            print(f"\n[{fold_str}] Early stopping triggered at epoch {epoch+1}!")
            break

    # Load best model (checkpoint written by EarlyStopping)
    # NOTE(review): torch.load without map_location assumes the checkpoint's
    # device is available here — confirm for CPU-only reload scenarios.
    print(f"\n[{fold_str}] Loading best model from {save_path}...")
    self.model.load_state_dict(torch.load(save_path))

    print("=" * 70)
    print(f"✓ Training completed for {fold_str}!")

    return history
|
| 420 |
+
|
| 421 |
+
def train_kfold(
    self,
    train_folds: List[Tuple[pd.DataFrame, pd.DataFrame]],
    epochs: int = 100,
    batch_size: int = 256,
    lr: float = 1e-4,
    patience: int = 5,
    num_workers: int = 8
) -> List[Dict[str, List[float]]]:
    """
    Train K-fold cross-validation models

    Each fold reinitializes the model from scratch (fresh weights, fold-specific
    seed) and saves its best checkpoint to ``<model_save_path>_fold{i}.pt``
    via the fold_id passed to train().

    Args:
        train_folds: List of (train_df, val_df) tuples for each fold
        epochs: Number of epochs per fold
        batch_size: Batch size
        lr: Learning rate
        patience: Early stopping patience
        num_workers: Number of data loading workers

    Returns:
        List of training histories for each fold
    """
    num_folds = len(train_folds)
    all_histories = []

    print("\n" + "=" * 70)
    print(f"Starting {num_folds}-Fold Cross-Validation Training")
    print("=" * 70)

    for fold_id, (df_train, df_val) in enumerate(train_folds):
        print(f"\n{'='*70}")
        print(f"Training Fold {fold_id+1}/{num_folds}")
        print(f"Train: {len(df_train)} samples | Val: {len(df_val)} samples")
        print(f"{'='*70}")

        self._set_seed(fold_id + self.seed)  # Different seed for each fold

        # Reinitialize model for this fold.
        # NOTE(review): assumes the current model instance exposes pep_dim /
        # hla_dim / bilinear_dim attributes — confirm on PeptideHLABindingPredictor.
        self.model = PeptideHLABindingPredictor(
            pep_dim=self.model.pep_dim,
            hla_dim=self.model.hla_dim,
            bilinear_dim=self.model.bilinear_dim,
            loss_fn=self.loss_fn_name,
            alpha=self.alpha,
            gamma=self.gamma,
            device=str(self.device),
            pos_weights=self.pos_weights
        ).to(self.device)

        # Train this fold; best weights land in the fold-specific checkpoint
        history = self.train(
            df_train,
            df_val,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            patience=patience,
            num_workers=num_workers,
            fold_id=fold_id
        )

        all_histories.append(history)

    print("\n" + "=" * 70)
    print(f"✓ All {num_folds} folds training completed!")
    print("=" * 70)

    # Print summary: best validation AUC per fold, then mean ± std
    print("\nCross-Validation Summary:")
    print("-" * 70)
    for fold_id, history in enumerate(all_histories):
        best_auc = max(history['val_auc'])
        best_epoch = history['val_auc'].index(best_auc) + 1
        print(f"Fold {fold_id}: Best Val AUC = {best_auc:.4f} (Epoch {best_epoch})")

    mean_auc = np.mean([max(h['val_auc']) for h in all_histories])
    std_auc = np.std([max(h['val_auc']) for h in all_histories])
    print("-" * 70)
    print(f"Mean Val AUC: {mean_auc:.4f} ± {std_auc:.4f}")
    print("=" * 70 + "\n")

    return all_histories
|
| 504 |
+
|
| 505 |
+
def predict(
    self,
    df: pd.DataFrame,
    batch_size: int = 256,
    return_probs: bool = True,
    return_attn: bool = False,
    use_kfold: bool = False,
    num_folds: Optional[int] = None,
    ensemble_method: str = 'mean',
    num_workers: int = 8
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """
    Make predictions on a dataset

    Args:
        df: DataFrame with peptide and HLA_full columns
        batch_size: Batch size for inference
        return_probs: If True, return probabilities; else return binary predictions
        return_attn: If True, also return padded cross-attention maps
        use_kfold: If True, use ensemble of K models
        num_folds: Number of folds (required if use_kfold=True)
        ensemble_method: 'mean' or 'median' for ensemble
        num_workers: Number of data loading workers

    Returns:
        Tuple of (predictions, attention maps or None). Note: this method
        always returns a 2-tuple, never a bare array.
    """
    # Check if embeddings are prepared
    if self.phys_dict is None or self.esm2_dict is None or self.struct_dict is None:
        raise ValueError("Embeddings not prepared! Call prepare_embeddings() first.")

    if use_kfold:
        if num_folds is None:
            raise ValueError("num_folds must be specified when use_kfold=True")

        return self._predict_ensemble(
            df,
            batch_size,
            num_folds,
            ensemble_method,
            return_probs,
            return_attn,
            num_workers
        )
    else:
        # load single model
        # NOTE(review): strict=False tolerates missing/unexpected state-dict
        # keys — confirm this is intentional and not masking checkpoint drift.
        print(f"\nLoading model from {self.model_save_path} for prediction...")
        self.model.load_state_dict(torch.load(self.model_save_path, map_location=self.device), strict=False)
        # Single model prediction
        return self._predict_single(df, batch_size, return_probs, return_attn, num_workers)
|
| 553 |
+
|
| 554 |
+
def _pad_attention(self, attns: List[np.ndarray]) -> np.ndarray:
|
| 555 |
+
"""Pad attention maps to the same length"""
|
| 556 |
+
max_len = max(a.shape[1] for a in attns)
|
| 557 |
+
attns_padded = []
|
| 558 |
+
for a in attns:
|
| 559 |
+
padding = max_len - a.shape[1]
|
| 560 |
+
pad_width_3d = ((0, 0), # 不填充 H 维度
|
| 561 |
+
(0, padding), # 填充 Lv 维度的末尾
|
| 562 |
+
(0, 0)) # 不填充 Lq 维度
|
| 563 |
+
|
| 564 |
+
attns_padded.append(np.pad(a, pad_width_3d, mode='constant', constant_values=0.0))
|
| 565 |
+
return np.concatenate(attns_padded, axis=0)
|
| 566 |
+
|
| 567 |
+
def _predict_single(
    self,
    df: pd.DataFrame,
    batch_size: int,
    return_probs: bool,
    return_attn: bool = False,
    num_workers: int = 8
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """Run inference with the currently loaded model.

    Returns (predictions, attention) where attention is None unless
    return_attn is True, in which case per-batch attention maps are padded
    to a common length via _pad_attention.
    """
    self.model.eval()

    dataset = PepHLA_Dataset(df, self.phys_dict, self.esm2_dict, self.struct_dict)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,  # keep row order aligned with df
        num_workers=num_workers,
        collate_fn=peptide_hla_collate_fn,
        pin_memory=True
    )

    preds = []
    attns = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Predicting"):
            # Model forward returns (probs, loss, attn, aux); loss is unused here
            probs, loss, attn, _ = self.model(batch)
            preds.extend(probs.tolist())
            if return_attn:
                # NOTE(review): attn is fed to np.pad later — assumes it is
                # already a NumPy array (or CPU-convertible); confirm upstream.
                attns.append(attn)

    preds = np.array(preds)
    if not return_probs:
        # Binarize at the fixed 0.5 threshold
        preds = (preds >= 0.5).astype(int)

    # padding attns to the same length
    if not return_attn:
        return preds, None
    else:
        return preds, self._pad_attention(attns)
|
| 606 |
+
|
| 607 |
+
def _predict_ensemble(
    self,
    df: pd.DataFrame,
    batch_size: int,
    num_folds: int,
    ensemble_method: str,
    return_probs: bool,
    return_attn: bool = False,
    num_workers: int = 8
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """Ensemble prediction using K-fold models.

    Loads each ``<model_save_path>_fold{i}.pt`` checkpoint in turn, predicts
    with it, and aggregates the per-fold probabilities with mean or median.
    Missing checkpoints are skipped with a warning.

    Returns (ensemble predictions, attention or None); when return_attn is
    True the per-fold padded attention maps are concatenated (not averaged)
    along axis 0 by _pad_attention.
    """

    print(f"\nEnsemble prediction using {num_folds} models...")
    print(f"Ensemble method: {ensemble_method}")

    all_preds = []
    all_attns = []

    for fold_id in range(num_folds):
        # Load fold model
        fold_model_path = self.model_save_path.replace('.pt', f'_fold{fold_id}.pt')

        if not os.path.exists(fold_model_path):
            print(f"⚠ Warning: {fold_model_path} not found, skipping...")
            continue

        print(f"Loading model from {fold_model_path}...")
        self.model.load_state_dict(torch.load(fold_model_path, map_location=self.device), strict=False)

        # Predict with this fold (always as probabilities; binarized only
        # after aggregation below)
        if not return_attn:
            fold_preds, _ = self._predict_single(df, batch_size, return_probs=True, num_workers=num_workers)
        else:
            fold_preds, attn_padded = self._predict_single(df, batch_size, return_probs=True, return_attn=True, num_workers=num_workers)
            all_attns.append(attn_padded)

        all_preds.append(fold_preds)

    if len(all_preds) == 0:
        raise ValueError("No fold models found!")

    # Ensemble predictions
    all_preds = np.array(all_preds)  # [num_folds, num_samples]

    if ensemble_method == 'mean':
        ensemble_preds = np.mean(all_preds, axis=0)
    elif ensemble_method == 'median':
        ensemble_preds = np.median(all_preds, axis=0)
    else:
        raise ValueError(f"Unknown ensemble method: {ensemble_method}")

    print(f"✓ Ensemble prediction completed using {len(all_preds)} models")

    if not return_probs:
        # Binarize the aggregated probabilities at 0.5
        ensemble_preds = (ensemble_preds >= 0.5).astype(int)

    if not return_attn:
        return ensemble_preds, None
    else:
        # Re-pad in case folds produced different sequence lengths, then
        # concatenate all folds' attention maps along axis 0.
        return ensemble_preds, self._pad_attention(all_attns)
|
| 672 |
+
|
| 673 |
+
def evaluate(
    self,
    df: pd.DataFrame,
    batch_size: int = 256,
    threshold: float = 0.5,
    use_kfold: bool = False,
    num_folds: Optional[int] = None,
    ensemble_method: str = 'mean',
    num_workers: int = 8
) -> Tuple[np.ndarray, Dict[str, float]]:
    """
    Evaluate model on a dataset

    Args:
        df: DataFrame with peptide, HLA_full, and label columns
        batch_size: Batch size for inference
        threshold: Classification threshold
        use_kfold: If True, use ensemble of K models
        num_folds: Number of folds (required if use_kfold=True)
        ensemble_method: 'mean' or 'median' for ensemble
        num_workers: Number of data loading workers

    Returns:
        Tuple of (predicted probabilities, dictionary of metrics)
    """
    y_true = df['label'].values
    y_prob, _ = self.predict(
        df,
        batch_size=batch_size,
        return_probs=True,
        use_kfold=use_kfold,
        num_folds=num_folds,
        ensemble_method=ensemble_method,
        num_workers=num_workers
    )
    y_pred = (y_prob >= threshold).astype(int)

    # Confusion-matrix counts; labels fixed to [0, 1] so ravel order is tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel().tolist()

    # Guard every ratio against an empty denominator explicitly instead of
    # the previous bare `except:` clauses, which silently swallowed *all*
    # exceptions (including typos and KeyboardInterrupt).
    total = tn + fp + fn + tp
    accuracy = (tp + tn) / total if total > 0 else 0.0

    mcc_denom = np.sqrt(float((tp + fn) * (tn + fp) * (tp + fp) * (tn + fn)))
    mcc = ((tp * tn) - (fn * fp)) / mcc_denom if mcc_denom > 0 else 0.0

    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    # AUC/PRC are undefined when y_true contains a single class; sklearn
    # raises ValueError in that case — catch only that.
    try:
        roc_auc = roc_auc_score(y_true, y_prob)
    except ValueError:
        roc_auc = 0.0

    from sklearn.metrics import average_precision_score
    try:
        prc_auc = average_precision_score(y_true, y_prob)
    except ValueError:
        prc_auc = 0.0

    # Print results
    model_type = f"{num_folds}-Fold Ensemble ({ensemble_method})" if use_kfold else "Single Model"

    print("\n" + "=" * 70)
    print(f"Evaluation Results [{model_type}]")
    print("=" * 70)
    print(f"tn = {tn}, fp = {fp}, fn = {fn}, tp = {tp}")
    print(f"y_pred: 0 = {Counter(y_pred)[0]} | 1 = {Counter(y_pred)[1]}")
    print(f"y_true: 0 = {Counter(y_true)[0]} | 1 = {Counter(y_true)[1]}")
    print(f"AUC: {roc_auc:.4f} | PRC: {prc_auc:.4f} | ACC: {accuracy:.4f} | MCC: {mcc:.4f} | F1: {f1:.4f}")
    print(f"Precision: {precision:.4f} | Recall: {recall:.4f}")
    print("=" * 70 + "\n")

    return y_prob, {
        'auc': roc_auc,
        'prc': prc_auc,
        'accuracy': accuracy,
        'mcc': mcc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp
    }
|
| 772 |
+
|
| 773 |
+
def save_model(self, path: str):
    """Persist the current model's weights (state_dict) to ``path``."""
    weights = self.model.state_dict()
    torch.save(weights, path)
    print(f"✓ Model saved to {path}")
|
| 777 |
+
|
| 778 |
+
def load_model(self, path: str):
    """Restore model weights from ``path`` (non-strict, mapped to self.device)."""
    checkpoint = torch.load(path, map_location=self.device)
    self.model.load_state_dict(checkpoint, strict=False)
    print(f"✓ Model loaded from {path}")
|
| 782 |
+
|
| 783 |
+
# ============================================================================
|
| 784 |
+
|
| 785 |
+
# -*- coding: utf-8 -*-
|
| 786 |
+
import os
|
| 787 |
+
import numpy as np
|
| 788 |
+
import pandas as pd
|
| 789 |
+
from collections import Counter
|
| 790 |
+
from tqdm import tqdm
|
| 791 |
+
import torch
|
| 792 |
+
from sklearn.metrics import roc_auc_score, confusion_matrix
|
| 793 |
+
|
| 794 |
+
class StriMap_TCRpHLA:
|
| 795 |
+
"""
|
| 796 |
+
Structure-informed TCR(α/β)–peptide–HLA Binding Prediction
|
| 797 |
+
- Reuses encoders from StriMap_pHLA (phys, ESM2, ESMFold)
|
| 798 |
+
- Precomputes peptide–HLA features using pretrained StriMap_pHLA.model (PeptideHLABindingPredictor)
|
| 799 |
+
and injects them into batch during training/inference.
|
| 800 |
+
"""
|
| 801 |
+
|
| 802 |
+
def __init__(
    self,
    pep_hla_system = None, # already-initialized and pretrained StriMap_pHLA
    device: str = 'cuda:0',
    model_save_path: str = 'best_model_tcrpHLA.pt',
    tcr_dim: int = 256,
    pep_dim: int = 256,
    hla_dim: int = 256,
    bilinear_dim: int = 256,
    loss_fn: str = 'bce',
    alpha: float = 0.5,
    gamma: float = 2.0,
    resample_negatives: bool = False,
    seed: int = 1,
    pos_weights: Optional[float] = None
):
    """Build the TCR–pHLA system on top of a pretrained StriMap_pHLA.

    Args:
        pep_hla_system: Trained StriMap_pHLA instance whose encoders and
            peptide–HLA model are reused (required; ValueError if None).
        device: CUDA device string; falls back to CPU when CUDA is absent.
        model_save_path: Checkpoint path for the TCR–pHLA model.
        tcr_dim / pep_dim / hla_dim / bilinear_dim: Model feature dims.
        loss_fn: Loss name passed to the predictor ('bce', ...).
        alpha, gamma: Focal-loss style hyperparameters forwarded to the model.
        resample_negatives: Stored flag; consumed by training logic.
        seed: RNG seed applied immediately via _set_seed.
        pos_weights: Optional positive-class weighting for the loss.
    """
    # Resolve device with a CPU fallback when CUDA is unavailable
    self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
    self.model_save_path = model_save_path
    self.seed = seed
    self.alpha = alpha
    self.gamma = gamma
    self.loss_fn_name = loss_fn
    self.resample_negatives = resample_negatives
    self.pos_weights = pos_weights

    # seed
    self._set_seed(seed)

    if pep_hla_system is None:
        raise ValueError("`pep_hla_system` must be provided — pass a trained StriMap_pHLA instance.")

    # Reuse encoders from StriMap_pHLA
    self.phys_encoder = pep_hla_system.phys_encoder
    self.esm2_encoder = pep_hla_system.esm2_encoder
    self.esmfold_encoder= pep_hla_system.esmfold_encoder
    self.pep_hla_model = pep_hla_system.model # PeptideHLABindingPredictor with encode_peptide_hla()

    # Initialize TCR–pHLA model
    self.model = TCRPeptideHLABindingPredictor(
        tcr_dim=tcr_dim,
        pep_dim=pep_dim,
        hla_dim=hla_dim,
        bilinear_dim=bilinear_dim,
        loss_fn=self.loss_fn_name,
        alpha=self.alpha,
        gamma=self.gamma,
        pos_weights=self.pos_weights,
        device=str(self.device),
    ).to(self.device)

    # Embedding caches (filled by prepare_embeddings / prepare_pep_hla_features)
    self.phys_dict = None
    self.esm2_dict = None
    self.struct_dict = None
    self.pep_hla_feat_dict = {}

    print(f"✓ StriMap_TCRpHLA initialized on {self.device}")
|
| 859 |
+
|
| 860 |
+
# -------------------- utils --------------------
|
| 861 |
+
def _set_seed(self, seed: int):
|
| 862 |
+
np.random.seed(seed)
|
| 863 |
+
torch.manual_seed(seed)
|
| 864 |
+
torch.cuda.manual_seed(seed)
|
| 865 |
+
torch.cuda.manual_seed_all(seed)
|
| 866 |
+
torch.backends.cudnn.benchmark = False
|
| 867 |
+
torch.backends.cudnn.deterministic = True
|
| 868 |
+
|
| 869 |
+
# -------------------- encoders --------------------
|
| 870 |
+
def _encode_phys(self, sequences):
    """Physicochemical encoding for TCR-pHLA sequences in fixed chunks of 256."""
    encoded = {}
    chunk_size = 256
    for start in tqdm(range(0, len(sequences), chunk_size), desc="Phys encoding (TCRpHLA)"):
        chunk = sequences[start:start + chunk_size]
        chunk_embs = self.phys_encoder(chunk).cpu()  # [B, L, D]
        encoded.update(zip(chunk, chunk_embs))
    return encoded
|
| 879 |
+
|
| 880 |
+
def save_model(self, path: str):
    """Serialize the TCR-pHLA model's state_dict to ``path``."""
    state = self.model.state_dict()
    torch.save(state, path)
    print(f"✓ Model saved to {path}")
|
| 883 |
+
|
| 884 |
+
def load_model(self, path: str):
    """Load TCR-pHLA model weights from ``path``, mapping onto self.device."""
    state = torch.load(path, map_location=self.device)
    self.model.load_state_dict(state)
    print(f"✓ Model loaded from {path}")
|
| 888 |
+
|
| 889 |
+
def _encode_esm2(self, sequences, prefix: str, re_embed: bool=False):
|
| 890 |
+
df_tmp = pd.DataFrame({'seq': sequences})
|
| 891 |
+
return self.esm2_encoder.forward(
|
| 892 |
+
df_tmp, seq_col='seq', prefix=prefix, batch_size=128, re_embed=re_embed
|
| 893 |
+
)
|
| 894 |
+
|
| 895 |
+
def _encode_structure(self, sequences, prefix: str, re_embed: bool=False):
|
| 896 |
+
feat_list, coor_list = self.esmfold_encoder.forward(
|
| 897 |
+
pd.DataFrame({prefix: sequences}), prefix, device=str(self.device), re_embed=re_embed
|
| 898 |
+
)
|
| 899 |
+
return {seq: (feat, coor) for seq, feat, coor in zip(sequences, feat_list, coor_list)}
|
| 900 |
+
|
| 901 |
+
# -------------------- public: prepare embeddings --------------------
|
| 902 |
+
def prepare_embeddings(self, df: pd.DataFrame, force_recompute: bool=False):
    """
    Prepare per-residue encodings for TCRα, TCRβ, peptide, and HLA.
    Peptide structure is computed via ESMFold as requested.

    Fills self.phys_dict / self.esm2_dict / self.struct_dict (each keyed by
    chain type then sequence) and records self.max_pep_len. All tensors are
    moved to CPU and CUDA caches are emptied so the GPU is free for training.
    """
    # Deduplicate and sort sequences per chain for stable iteration order
    all_tcra = sorted(set(df['tcra'].astype(str)))
    all_tcrb = sorted(set(df['tcrb'].astype(str)))
    all_peps = sorted(set(df['peptide'].astype(str)))
    all_hlas = sorted(set(df['HLA_full'].astype(str)))

    # Longest peptide length, later used by encode_peptide_hla
    self.max_pep_len = max(len(p) for p in all_peps)

    print(f"\nPreparing embeddings:")
    print(f" - TCRα: {len(all_tcra)} | TCRβ: {len(all_tcrb)} | peptides: {len(all_peps)} | HLAs: {len(all_hlas)}\n")

    self.phys_dict = {
        'tcra': self._encode_phys(all_tcra),
        'tcrb': self._encode_phys(all_tcrb),
        'pep': self._encode_phys(all_peps),
        'hla': self._encode_phys(all_hlas)
    }
    self.esm2_dict = {
        'tcra': self._encode_esm2(all_tcra, prefix='tcra', re_embed=force_recompute),
        'tcrb': self._encode_esm2(all_tcrb, prefix='tcrb', re_embed=force_recompute),
        'pep': self._encode_esm2(all_peps, prefix='pep', re_embed=force_recompute),
        'hla': self._encode_esm2(all_hlas, prefix='hla', re_embed=force_recompute)
    }

    # Move everything in phys_dict and esm2_dict to CPU
    for d in [self.phys_dict, self.esm2_dict]:
        for k1 in d.keys(): # tcra / tcrb / pep / hla
            for k2 in d[k1].keys(): # actual sequences
                if torch.is_tensor(d[k1][k2]):
                    d[k1][k2] = d[k1][k2].cpu()

    # Free GPU memory before the (memory-heavy) ESMFold pass
    torch.cuda.empty_cache()

    # IMPORTANT: include peptide structure via ESMFold
    self.struct_dict = {
        'tcra': self._encode_structure(all_tcra, prefix='tcra', re_embed=force_recompute),
        'tcrb': self._encode_structure(all_tcrb, prefix='tcrb', re_embed=force_recompute),
        'pep': self._encode_structure(all_peps, prefix='pep', re_embed=force_recompute),
        'hla': self._encode_structure(all_hlas, prefix='hla', re_embed=force_recompute)
    }

    print("✓ Embeddings prepared for TCRα/β, peptide (with ESMFold), and HLA.")

    # Move structure features to CPU (value replacement only — safe during
    # items() iteration because no keys are added or removed)
    for part in ['tcra', 'tcrb', 'pep', 'hla']:
        for seq, (feat, coord) in self.struct_dict[part].items():
            self.struct_dict[part][seq] = (feat.cpu(), coord.cpu())

    torch.cuda.empty_cache()
    print("✓ All embeddings moved to CPU, GPU memory released.")
|
| 956 |
+
|
| 957 |
+
# -------------------- public: precompute pHLA features --------------------
|
| 958 |
+
def prepare_pep_hla_features(self, df: pd.DataFrame):
    """
    Precompute peptide-HLA features using pretrained PeptideHLABindingPredictor.
    The resulting features are stored in self.pep_hla_feat_dict and later injected into each batch.
    """
    assert self.phys_dict is not None and self.esm2_dict is not None and self.struct_dict is not None, \
        "Call prepare_embeddings() first."

    # Unique (peptide, HLA) pairs present in the dataframe
    pairs = {(row['peptide'], row['HLA_full']) for _, row in df.iterrows()}
    # Freeze the pretrained pHLA model: eval mode + no gradient tracking
    self.pep_hla_model.eval()
    for p in self.pep_hla_model.parameters():
        p.requires_grad = False

    print(f"\nPrecomputing peptide-HLA features for {len(pairs)} unique pairs...")
    with torch.no_grad():
        for pep, hla in tqdm(pairs, desc="pHLA features"):
            # Size-1 batches of cached embeddings, moved to the model device
            pep_phys = self.phys_dict['pep'][pep].unsqueeze(0).to(self.device)
            pep_esm = self.esm2_dict['pep'][pep].unsqueeze(0).to(self.device)
            # If your PeptideHLABindingPredictor supports peptide structure, pass it too:
            # NOTE(review): pep_struct/pep_coord are prepared here but NOT
            # passed to encode_peptide_hla below — confirm whether the
            # upgraded signature should receive them.
            pep_struct, pep_coord = self.struct_dict['pep'][pep]
            pep_struct = pep_struct.unsqueeze(0).to(self.device)
            pep_coord = pep_coord.unsqueeze(0).to(self.device)

            hla_phys = self.phys_dict['hla'][hla].unsqueeze(0).to(self.device)
            hla_esm = self.esm2_dict['hla'][hla].unsqueeze(0).to(self.device)
            hla_struct, hla_coord = self.struct_dict['hla'][hla]
            hla_struct = hla_struct.unsqueeze(0).to(self.device)
            hla_coord = hla_coord.unsqueeze(0).to(self.device)

            # NOTE: encode_peptide_hla must accept (pep_struct, pep_coord) if you upgraded it;
            # otherwise remove those two args.
            pep_feat, hla_feat = self.pep_hla_model.encode_peptide_hla(
                pep,
                pep_phys, pep_esm,
                hla_phys, hla_esm,
                hla_struct, hla_coord,
                max_pep_len=self.max_pep_len
            )
            # Cache frozen features on CPU, keyed by the (peptide, HLA) pair
            self.pep_hla_feat_dict[(pep, hla)] = {
                'pep_feat_pretrain': pep_feat.squeeze(0).cpu(), # [Lp, pep_dim]
                'hla_feat_pretrain': hla_feat.squeeze(0).cpu() # [Lh, hla_dim]
            }
    print("✓ Pretrained peptide-HLA features prepared.")
|
| 1001 |
+
|
| 1002 |
+
# -------------------- training --------------------
|
| 1003 |
+
def train(
    self,
    df_train: pd.DataFrame,
    df_val: Optional[pd.DataFrame] = None,
    df_test: Optional[pd.DataFrame] = None,
    df_neg: Optional[pd.DataFrame] = None,
    epochs: int = 100,
    batch_size: int = 128,
    lr: float = 1e-4,
    patience: int = 5,
    num_workers: int = 8,
):
    """
    Train the TCR-pHLA model.

    Args:
        df_train: Training data.
        df_val: Optional validation data. Enables early stopping and best-model saving.
        df_test: Optional test data for evaluation after each epoch.
        df_neg: Optional negative samples for training. Set when resample_negatives=True.
        epochs: Number of epochs.
        batch_size: Batch size.
        lr: Learning rate.
        patience: Early stopping patience.
        num_workers: Data loading workers.

    Returns:
        history: Dict containing training and validation metrics.
    """

    # ---- Prepare embeddings ----
    # Precompute pHLA features for every dataframe that will be seen during training/eval.
    print("Preparing peptide-HLA features...")
    all_dfs = [df for df in [df_train, df_val, df_test, df_neg] if df is not None]
    self.prepare_pep_hla_features(pd.concat(all_dfs, axis=0))

    # ---- Validation loader (optional) ----
    if df_val is not None:
        val_ds = TCRPepHLA_Dataset(df_val, self.phys_dict, self.esm2_dict, self.struct_dict, self.pep_hla_feat_dict)
        val_loader = torch.utils.data.DataLoader(
            val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers,
            collate_fn=tcr_pep_hla_collate_fn, pin_memory=True
        )
        stopper = EarlyStopping(patience=patience, save_path=self.model_save_path)
    else:
        val_loader, stopper = None, None

    # ---- Optimizer ----
    optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)

    # ---- Metric history ----
    history = {'train_loss': [], 'train_auc': []}
    if df_val is not None:
        history.update({'val_loss': [], 'val_auc': [], 'val_prc': []})

    print("\nStart training TCR–pHLA model...")
    df_train_pos = df_train[df_train['label'] == 1].copy().reset_index(drop=True)

    for epoch in range(epochs):
        # ---------- Training ----------
        if self.resample_negatives:
            # Draw a fresh negative set each epoch; seeding with the epoch index keeps runs reproducible.
            df_train_neg = negative_sampling_phla(df_train_pos, random_state=epoch)
            if df_neg is not None:
                df_train_neg = pd.concat([df_train_neg, df_neg], axis=0).reset_index(drop=True)
            df_train_resample = pd.concat([df_train_pos, df_train_neg], axis=0).reset_index(drop=True)
            train_ds = TCRPepHLA_Dataset(df_train_resample, self.phys_dict, self.esm2_dict, self.struct_dict, self.pep_hla_feat_dict)
        else:
            train_ds = TCRPepHLA_Dataset(df_train, self.phys_dict, self.esm2_dict, self.struct_dict, self.pep_hla_feat_dict)

        train_loader = torch.utils.data.DataLoader(
            train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers,
            collate_fn=tcr_pep_hla_collate_fn, pin_memory=True
        )

        self.model.train()
        train_labels, train_preds = [], []
        epoch_loss = 0.0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]"):
            optimizer.zero_grad()
            probs, loss, _, _ = self.model(batch)
            loss.backward()
            # Gradient clipping to stabilize training.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=2.0)
            optimizer.step()

            epoch_loss += loss.item()
            train_labels.extend(batch['label'].cpu().numpy().tolist())
            train_preds.extend(probs.detach().cpu().numpy().tolist())

        train_auc = roc_auc_score(train_labels, train_preds)
        # FIX: previously divided by a loop-leaked enumerate index (ibatch + 1),
        # which raised NameError on an empty loader; len(train_loader) is equivalent
        # for non-empty loaders and safe otherwise.
        train_loss = epoch_loss / max(len(train_loader), 1)
        history['train_loss'].append(train_loss)
        history['train_auc'].append(train_auc)
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Train AUC: {train_auc:.4f}")

        # ---------- Validation ----------
        if df_val is not None:
            self.model.eval()
            val_loss_sum, val_labels, val_preds = 0.0, [], []
            with torch.no_grad():
                for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]"):
                    probs, loss, _, _ = self.model(batch)
                    val_loss_sum += loss.item()
                    val_labels.extend(batch['label'].cpu().numpy().tolist())
                    val_preds.extend(probs.detach().cpu().numpy().tolist())

            val_loss = val_loss_sum / len(val_loader)
            val_auc = roc_auc_score(val_labels, val_preds)
            val_prc = average_precision_score(val_labels, val_preds)
            history['val_loss'].append(val_loss)
            history['val_auc'].append(val_auc)
            history['val_prc'].append(val_prc)
            print(f"Epoch {epoch+1}/{epochs} | Val AUC: {val_auc:.4f} | Val PRC: {val_prc:.4f} | Val Loss: {val_loss:.4f}")

            # Early stopping tracks validation AUC and snapshots the best model.
            stopper(val_auc, self.model)
            if stopper.early_stop:
                print(f"Early stopping at epoch {epoch+1}")
                break

        # ---------- Optional Test ----------
        if df_test is not None:
            test_ds = TCRPepHLA_Dataset(df_test, self.phys_dict, self.esm2_dict, self.struct_dict, self.pep_hla_feat_dict)
            test_loader = torch.utils.data.DataLoader(
                test_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers,
                collate_fn=tcr_pep_hla_collate_fn, pin_memory=True
            )
            self.model.eval()
            test_labels, test_preds = [], []
            with torch.no_grad():
                for batch in tqdm(test_loader, desc=f"Epoch {epoch+1}/{epochs} [Test]"):
                    probs, _, _, _ = self.model(batch)
                    test_labels.extend(batch['label'].cpu().numpy().tolist())
                    test_preds.extend(probs.detach().cpu().numpy().tolist())
            test_auc = roc_auc_score(test_labels, test_preds)
            test_prc = average_precision_score(test_labels, test_preds)
            print(f"Epoch {epoch+1}/{epochs} | Test AUC: {test_auc:.4f} | Test PRC: {test_prc:.4f}")

    # ---- Load best model only if validation used ----
    if df_val is not None and os.path.exists(self.model_save_path):
        self.model.load_state_dict(torch.load(self.model_save_path, map_location=self.device))
        print(f"✓ Training finished. Best model loaded from {self.model_save_path}")
    else:
        print("✓ Training finished (no validation set used).")

    return history
|
| 1147 |
+
|
| 1148 |
+
def train_kfold(
    self,
    train_folds: List[Tuple[pd.DataFrame, pd.DataFrame]],
    df_test: Optional[pd.DataFrame] = None,
    df_neg: Optional[pd.DataFrame] = None,
    epochs: int = 100,
    batch_size: int = 128,
    lr: float = 1e-4,
    patience: int = 8,
    num_workers: int = 8,
) -> List[Dict[str, List[float]]]:
    """
    K-fold cross-validation training for TCR-pHLA model.

    Args:
        train_folds: list of (train_df, val_df) tuples for each fold
        df_test: optional test data for evaluation after each epoch
        df_neg: optional negative samples for training. Set when resample_negatives=True.
        epochs: training epochs
        batch_size: batch size
        lr: learning rate
        patience: early stopping patience
        num_workers: dataloader workers

    Returns:
        List of training histories for each fold
    """
    num_folds = len(train_folds)
    all_histories = []

    print("\n" + "=" * 70)
    print(f"Starting {num_folds}-Fold Cross-Validation Training (TCR-pHLA)")
    print("=" * 70)

    for fold_id, (df_train, df_val) in enumerate(train_folds):
        print(f"\n{'='*70}")
        print(f"Training Fold {fold_id+1}/{num_folds}")
        print(f"{'='*70}")

        # Distinct seed per fold for reproducible but decorrelated initializations.
        self._set_seed(self.seed + fold_id)

        # Re-instantiate a fresh model for each fold, mirroring the current config.
        self.model = TCRPeptideHLABindingPredictor(
            tcr_dim=self.model.tcr_dim,
            pep_dim=self.model.pep_dim,
            hla_dim=self.model.hla_dim,
            bilinear_dim=self.model.bilinear_dim,
            loss_fn=self.loss_fn_name,
            alpha=self.alpha,
            gamma=self.gamma,
            pos_weights=self.pos_weights,
            device=str(self.device),
        ).to(self.device)

        fold_save_path = self.model_save_path.replace(".pt", f"_fold{fold_id}.pt")

        history = self.train(
            df_train=df_train,
            df_val=df_val,
            df_test=df_test,
            df_neg=df_neg,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            patience=patience,
            num_workers=num_workers,
        )

        torch.save(self.model.state_dict(), fold_save_path)
        print(f"✓ Saved fold {fold_id} model to {fold_save_path}")

        all_histories.append(history)

    print("\n" + "=" * 70)
    print(f"✓ All {num_folds} folds training completed (TCR-pHLA)")
    print("=" * 70)

    # FIX: the summary was previously gated on the loop-leaked variable `df_val`
    # (undefined if `train_folds` is empty). Gate on the collected histories
    # actually containing validation metrics instead.
    if all_histories and 'val_auc' in all_histories[0]:
        print("\nCross-Validation Summary:")
        print("-" * 70)
        for fold_id, hist in enumerate(all_histories):
            best_auc = max(hist['val_auc'])
            best_prc = max(hist['val_prc'])
            best_epoch = hist['val_auc'].index(best_auc) + 1
            print(f"Fold {fold_id}: Best Val AUC = {best_auc:.4f}, Best Val PRC = {best_prc:.4f}, (Epoch {best_epoch})")

        mean_auc = np.mean([max(h['val_auc']) for h in all_histories])
        std_auc = np.std([max(h['val_auc']) for h in all_histories])
        print("-" * 70)
        print(f"Mean Val AUC: {mean_auc:.4f} ± {std_auc:.4f}")
        print("=" * 70 + "\n")

    return all_histories
|
| 1240 |
+
|
| 1241 |
+
# -------------------- single-set predict --------------------
|
| 1242 |
+
def _predict_single(
    self, df: pd.DataFrame,
    batch_size: int = 128,
    return_probs: bool = True,
    num_workers: int = 8
):
    """Run inference with the currently loaded model on one dataframe.

    Returns (predictions, per-batch peptide features, per-batch attention dicts).
    Predictions are probabilities, or 0/1 labels when return_probs is False.
    """
    self.model.eval()
    dataset = TCRPepHLA_Dataset(df, self.phys_dict, self.esm2_dict, self.struct_dict, self.pep_hla_feat_dict)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=tcr_pep_hla_collate_fn,
        num_workers=num_workers,
        pin_memory=True
    )

    scores = []
    pep_features = []
    attn_maps = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Predicting (TCR-pHLA)"):
            probs, _, pep_feature, attn_dict = self.model(batch)
            scores.extend(probs.tolist())
            pep_features.append(pep_feature)
            attn_maps.append(attn_dict)

    scores = np.array(scores)

    # Binarize at 0.5 when hard labels were requested.
    if not return_probs:
        scores = (scores >= 0.5).astype(int)

    return scores, pep_features, attn_maps
|
| 1275 |
+
|
| 1276 |
+
# ================================================================
|
| 1277 |
+
# Ensemble prediction
|
| 1278 |
+
# ================================================================
|
| 1279 |
+
def _predict_ensemble(
    self,
    df: pd.DataFrame,
    batch_size: int = 128,
    num_folds: int = 5,
    ensemble_method: str = 'mean',
    return_probs: bool = True,
    num_workers: int = 8
) -> Tuple[np.ndarray, List, List]:
    """
    Ensemble prediction using multiple fold models.

    Args:
        df: Data to score.
        batch_size: Batch size for inference.
        num_folds: Number of fold checkpoints to look for.
        ensemble_method: 'mean' or 'median' aggregation across folds.
        return_probs: If False, binarize the ensemble output at 0.5.
        num_workers: Data loading workers.

    Returns:
        Tuple of (ensemble predictions, per-fold peptide features,
        per-fold attention dicts).
        NOTE: the previous annotation claimed a bare np.ndarray return,
        but the method has always returned this 3-tuple.
    """
    print(f"\nEnsemble prediction using {num_folds} TCR–pHLA models...")
    print(f"Ensemble method: {ensemble_method}")

    pep_feats_folds = []
    attn_dict_folds = []
    all_preds = []
    for fold_id in range(num_folds):
        fold_model_path = self.model_save_path.replace(".pt", f"_fold{fold_id}.pt")
        # Missing checkpoints are skipped rather than treated as fatal.
        if not os.path.exists(fold_model_path):
            print(f"⚠ Warning: {fold_model_path} not found, skipping...")
            continue

        print(f"Loading model from {fold_model_path}...")
        self.model.load_state_dict(torch.load(fold_model_path, map_location=self.device), strict=False)

        # Predict for this fold
        fold_preds, fold_pep_feature, fold_attn_dict = self._predict_single(
            df, batch_size=batch_size, return_probs=True, num_workers=num_workers
        )
        all_preds.append(fold_preds)
        pep_feats_folds.append(fold_pep_feature)
        attn_dict_folds.append(fold_attn_dict)

    if len(all_preds) == 0:
        raise ValueError("No fold models found!")

    if ensemble_method == 'mean':
        ensemble_preds = np.mean(all_preds, axis=0)
    elif ensemble_method == 'median':
        ensemble_preds = np.median(all_preds, axis=0)
    else:
        raise ValueError(f"Unknown ensemble method: {ensemble_method}")

    print(f"✓ Ensemble prediction completed using {len(all_preds)} folds")

    if not return_probs:
        ensemble_preds = (ensemble_preds >= 0.5).astype(int)

    return ensemble_preds, pep_feats_folds, attn_dict_folds
|
| 1330 |
+
|
| 1331 |
+
|
| 1332 |
+
# ================================================================
|
| 1333 |
+
# Unified predict() with ensemble support
|
| 1334 |
+
# ================================================================
|
| 1335 |
+
def predict(
    self,
    df: pd.DataFrame,
    batch_size: int = 128,
    return_probs: bool = True,
    use_kfold: bool = False,
    num_folds: Optional[int] = None,
    ensemble_method: str = 'mean',
    num_workers: int = 8
) -> Tuple[np.ndarray, List, List]:
    """
    Predict binding probabilities or binary labels for a dataframe.

    When use_kfold is True, predictions are aggregated across the saved
    per-fold checkpoints; otherwise the currently loaded model is used.
    """
    print('Preparing peptide-HLA features for prediction set...')
    self.prepare_pep_hla_features(df)

    # Single-model path: score with the model currently in memory.
    if not use_kfold:
        return self._predict_single(
            df, batch_size=batch_size, return_probs=return_probs, num_workers=num_workers
        )

    # Ensemble path: fold count must be explicit.
    if num_folds is None:
        raise ValueError("num_folds must be specified when use_kfold=True")
    return self._predict_ensemble(
        df=df,
        batch_size=batch_size,
        num_folds=num_folds,
        ensemble_method=ensemble_method,
        return_probs=return_probs,
        num_workers=num_workers
    )
|
| 1366 |
+
|
| 1367 |
+
|
| 1368 |
+
# ================================================================
|
| 1369 |
+
# Unified evaluate() with ensemble support
|
| 1370 |
+
# ================================================================
|
| 1371 |
+
def evaluate(
    self,
    df: pd.DataFrame,
    batch_size: int = 128,
    threshold: float = 0.5,
    use_kfold: bool = False,
    num_folds: Optional[int] = None,
    ensemble_method: str = 'mean',
    num_workers: int = 8
) -> Dict[str, float]:
    """
    Evaluate model performance on a dataset.

    If use_kfold=True, performs ensemble evaluation across folds.

    Args:
        df: Data with a 'label' column plus the columns required by predict().
        batch_size: Batch size for inference.
        threshold: Probability cutoff for the binary prediction.
        use_kfold: Whether to ensemble the per-fold checkpoints.
        num_folds: Number of folds (required when use_kfold=True).
        ensemble_method: 'mean' or 'median' fold aggregation.
        num_workers: Data loading workers.

    Returns:
        Dict with auc, accuracy, mcc, f1, precision, recall and the
        confusion-matrix counts (tn, fp, fn, tp).
    """
    y_true = df['label'].values
    y_prob, all_pep_features, merged_attn = self.predict(
        df,
        batch_size=batch_size,
        return_probs=True,
        use_kfold=use_kfold,
        num_folds=num_folds,
        ensemble_method=ensemble_method,
        num_workers=num_workers
    )
    y_pred = (y_prob >= threshold).astype(int)

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel().tolist()
    # 1e-9 epsilons guard against division by zero on degenerate inputs.
    accuracy = (tp + tn) / (tn + fp + fn + tp + 1e-9)
    # FIX: narrowed the bare `except:` clauses, which also swallowed
    # SystemExit/KeyboardInterrupt, to the arithmetic errors that can occur here.
    try:
        mcc = ((tp*tn) - (fn*fp)) / np.sqrt(float((tp+fn)*(tn+fp)*(tp+fp)*(tn+fn)) + 1e-9)
    except (ZeroDivisionError, FloatingPointError, ValueError):
        mcc = 0.0
    recall = tp / (tp + fn + 1e-9)
    precision = tp / (tp + fp + 1e-9)
    f1 = 2 * precision * recall / (precision + recall + 1e-9)
    try:
        # NOTE: max_fpr=0.1 yields a McClish-standardized *partial* AUC
        # (FPR <= 0.1), not the full ROC AUC, even though it is printed as "AUC".
        auc = roc_auc_score(y_true, y_prob, max_fpr=0.1)
    except ValueError:
        # Raised e.g. when only one class is present in y_true.
        auc = 0.0

    print("\n" + "=" * 70)
    print(f"Evaluation Results [{'K-Fold Ensemble' if use_kfold else 'Single Model'}]")
    print("=" * 70)
    print(f"tn={tn}, fp={fp}, fn={fn}, tp={tp}")
    print(f"AUC={auc:.4f} | ACC={accuracy:.4f} | MCC={mcc:.4f} | F1={f1:.4f} | P={precision:.4f} | R={recall:.4f}")
    print("=" * 70 + "\n")

    return dict(
        auc=auc, accuracy=accuracy, mcc=mcc, f1=f1,
        precision=precision, recall=recall,
        tn=tn, fp=fp, fn=fn, tp=tp
    )
|
src/model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0d6a02d441849cccafebf30918fd04d2379d8b8fe06b50de49fcabf8e86d77af
|
| 3 |
+
size 22006159
|
src/model.py
ADDED
|
@@ -0,0 +1,1995 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
import numpy as np
|
| 6 |
+
from typing import Dict, List, Tuple
|
| 7 |
+
from torch.nn.utils.parametrizations import weight_norm
|
| 8 |
+
from torch.nn import TransformerEncoder, TransformerEncoderLayer
|
| 9 |
+
|
| 10 |
+
import esm
|
| 11 |
+
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
from typing import Dict, List, Tuple
|
| 15 |
+
|
| 16 |
+
import tempfile
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
import mdtraj as md
|
| 19 |
+
|
| 20 |
+
# import io
|
| 21 |
+
# import gzip
|
| 22 |
+
import os
|
| 23 |
+
|
| 24 |
+
from egnn_pytorch import EGNN
|
| 25 |
+
|
| 26 |
+
from transformers import AutoTokenizer, EsmForProteinFolding
|
| 27 |
+
|
| 28 |
+
import logging
|
| 29 |
+
logging.basicConfig(level=logging.INFO)
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
# from re import search as re_search
|
| 33 |
+
import re
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def determine_tcr_seq_vj(cdr3,V,J,chain,guess01=False):
    """Reconstruct full-length TCR protein sequences from CDR3 + V/J gene calls.

    Args:
        cdr3: indexable collection of CDR3 amino-acid strings (C...F/W), one per TCR.
        V, J: parallel collections of V/J gene (allele) names, one per TCR.
        chain: 'A' selects the alpha-chain gene libraries; any other value
            (typically 'B') selects the beta-chain libraries.
        guess01: if True, append '*01' to gene names that carry no allele suffix.

    Returns:
        list[str]: one spliced V-prefix + CDR3 + J-suffix protein sequence per input row.

    Raises:
        KeyError: if a V/J gene name is missing from the library dictionaries.
        ValueError: if a V gene sequence contains no 'C' to anchor the CDR3.
    """

    def file2dict(filename,key_fields,store_fields,delimiter='\t'):
        """Read file to a dictionary.
        key_fields: fields to be used as keys (nested dict, one level per key field)
        store_fields: fields to be saved as a list
        delimiter: delimiter used in the given file."""
        dictionary={}
        with open(filename, newline='') as csvfile:
            reader = csv.DictReader(csvfile,delimiter=delimiter)
            for row in reader:
                keys = [row[k] for k in key_fields]
                store= [row[s] for s in store_fields]

                # Walk (creating as needed) the nested dict along all but the last key.
                sub_dict = dictionary
                for key in keys[:-1]:
                    if key not in sub_dict:
                        sub_dict[key] = {}
                    sub_dict = sub_dict[key]
                key = keys[-1]
                if key not in sub_dict:
                    sub_dict[key] = []
                sub_dict[key].append(store)
        return dictionary

    def get_protseqs_ntseqs(chain='B'):
        """returns sequence dictionaries for genes: protseqsV, protseqsJ, nucseqsV, nucseqsJ"""
        seq_dicts=[]
        for gene,type in zip(['v','j','v','j'],['aa','aa','nt','nt']):
            # e.g. 'library/trbvs_aa.tsv' — relative path; assumes the process
            # CWD contains the 'library/' folder. TODO confirm against deployment.
            name = 'library/'+'tr'+chain.lower()+gene+'s_'+type+'.tsv'
            sdict = file2dict(name,key_fields=['Allele'],store_fields=[type+'_seq'])
            for g in sdict:
                # file2dict stores a list of single-element lists; unwrap to the bare string.
                sdict[g]=sdict[g][0][0]
            seq_dicts.append(sdict)
        return seq_dicts

    # NOTE(review): the four TSV libraries per chain are re-read on every call of
    # determine_tcr_seq_vj — consider hoisting/caching if called in a loop.
    protVb,protJb,_,_ = get_protseqs_ntseqs(chain='B')
    protVa,protJa,_,_ = get_protseqs_ntseqs(chain='A')

    def splice_v_cdr3_j(pv: str, pj: str, cdr3: str) -> str:
        """
        pv: V gene protein sequence
        pj: J gene protein sequence
        cdr3: C-starting, F/W-ending CDR3 sequence (protein)
        Returns: The spliced full sequence (V[:lastC] + CDR3 + J suffix)
        """
        pv = (pv or "").strip().upper()
        pj = (pj or "").strip().upper()
        cdr3 = (cdr3 or "").strip().upper()

        # 1) V segment: anchor on the LAST 'C' in the V region (conserved cysteine).
        cpos = pv.rfind('C')
        if cpos == -1:
            raise ValueError("V sequence has no 'C' to anchor CDR3 start.")
        # pv[:cpos] EXCLUDES position cpos, i.e. the conserved C itself is not
        # duplicated here — it is contributed by cdr3, which starts with 'C'.
        v_prefix = pv[:cpos]

        # 2) Align the CDR3's "end overlap" inside J: starting from the full CDR3
        # and shortening from the left, find the longest CDR3 suffix that occurs
        # in J, then keep only the J residues AFTER that match.
        j_suffix = pj  # fallback (in extreme cases where no suffix matches)
        for k in range(len(cdr3), 0, -1):
            tail = cdr3[-k:]  # current CDR3 suffix
            m = re.search(re.escape(tail), pj)
            if m:
                j_suffix = pj[m.end():]  # J sequence after the matched segment
                break

        return v_prefix + cdr3 + j_suffix

    tcr_list = []
    for i in range(len(cdr3)):
        cdr3_ = cdr3[i]
        V_ = V[i]
        J_ = J[i]
        # Select the gene-sequence dictionaries for the requested chain.
        if chain=='A':
            protseqsV = protVa
            protseqsJ = protJa
        else:
            protseqsV = protVb
            protseqsJ = protJb
        if guess01:
            # Default to the *01 allele when no explicit allele is given.
            if '*' not in V_:
                V_+='*01'
            if '*' not in J_:
                J_+='*01'
        pv = protseqsV[V_]  # KeyError for unknown gene/allele names
        pj = protseqsJ[J_]
        # t = pv[:pv.rfind('C')]+ cdr3_ + pj[re_search(r'[FW]G.[GV]',pj).start()+1:]
        t = splice_v_cdr3_j(pv, pj, cdr3_)
        tcr_list.append(t)
    return tcr_list
| 126 |
+
|
| 127 |
+
# def negative_sampling_phla(df, neg_ratio=5, label_col='label', neg_label=0, random_state=42):
|
| 128 |
+
# """
|
| 129 |
+
# Create negative samples by shuffling the TCR sequences while keeping the peptide-HLA pairs intact.
|
| 130 |
+
# Ensures that the generated (TCR, peptide, HLA) triplets do not exist in the original dataset.
|
| 131 |
+
# """
|
| 132 |
+
# negative_samples = []
|
| 133 |
+
|
| 134 |
+
# # 正样本 triplet 集合
|
| 135 |
+
# pos_triplets = set(zip(
|
| 136 |
+
# df['tcra'], df['tcrb'], df['peptide'], df['HLA_full']
|
| 137 |
+
# ))
|
| 138 |
+
|
| 139 |
+
# for i in range(neg_ratio):
|
| 140 |
+
# shuffled_df = df.copy()
|
| 141 |
+
|
| 142 |
+
# tcr_cols = ['tcra', 'cdr3a_start', 'cdr3a_end', 'tcrb', 'cdr3b_start', 'cdr3b_end']
|
| 143 |
+
# shuffled_tcr = df[tcr_cols].sample(frac=1, random_state=random_state + i).reset_index(drop=True)
|
| 144 |
+
|
| 145 |
+
# for col in tcr_cols:
|
| 146 |
+
# shuffled_df[col] = shuffled_tcr[col]
|
| 147 |
+
|
| 148 |
+
# # 剔除:1) TCR 未改变的行 2) triplet 与正样本重复
|
| 149 |
+
# mask_keep = []
|
| 150 |
+
# for idx, row in shuffled_df.iterrows():
|
| 151 |
+
# triplet = (row['tcra'], row['tcrb'], row['peptide'], row['HLA_full'])
|
| 152 |
+
# if triplet in pos_triplets:
|
| 153 |
+
# mask_keep.append(False)
|
| 154 |
+
# else:
|
| 155 |
+
# mask_keep.append(True)
|
| 156 |
+
|
| 157 |
+
# shuffled_df = shuffled_df[mask_keep]
|
| 158 |
+
# shuffled_df[label_col] = neg_label
|
| 159 |
+
|
| 160 |
+
# negative_samples.append(shuffled_df)
|
| 161 |
+
|
| 162 |
+
# negative_samples = pd.concat(negative_samples, ignore_index=True).drop_duplicates()
|
| 163 |
+
# return negative_samples
|
| 164 |
+
|
| 165 |
+
import numpy as np
|
| 166 |
+
import pandas as pd
|
| 167 |
+
|
| 168 |
+
# def balanced_negative_sampling_phla(df, label_col='label', neg_label=0, random_state=42):
|
| 169 |
+
# """
|
| 170 |
+
# 为每个 (peptide, HLA_full) 平衡采样负样本:
|
| 171 |
+
# - 找出正样本最多的 peptide
|
| 172 |
+
# - 该 peptide 的负样本数量 = 1:1,从其他 peptide 的 TCR 中采样(保持 peptide–HLA 配对)
|
| 173 |
+
# - 其他 peptide 采样负样本,使每个 peptide 拥有相同总样本数
|
| 174 |
+
# - 保证 peptide 与 HLA_full 始终保持配对关系
|
| 175 |
+
# """
|
| 176 |
+
# np.random.seed(random_state)
|
| 177 |
+
|
| 178 |
+
# pos_df = df[df[label_col] != neg_label].copy()
|
| 179 |
+
# pos_counts = pos_df['peptide'].value_counts()
|
| 180 |
+
# max_peptide = pos_counts.idxmax()
|
| 181 |
+
# max_pos = pos_counts.max()
|
| 182 |
+
# total_target = max_pos * 2 # 每个 peptide 的最终样本数(正+负)
|
| 183 |
+
|
| 184 |
+
# neg_samples = []
|
| 185 |
+
|
| 186 |
+
# # 针对 max_peptide:负样本 = 1:1
|
| 187 |
+
# df_other_tcrs = pos_df[pos_df['peptide'] != max_peptide][['tcra', 'tcrb', 'cdr3a_start', 'cdr3a_end', 'cdr3b_start', 'cdr3b_end']].copy()
|
| 188 |
+
# neg_max = pos_df[pos_df['peptide'] == max_peptide].copy()
|
| 189 |
+
# sampled_tcrs = df_other_tcrs.sample(
|
| 190 |
+
# n=max_pos,
|
| 191 |
+
# replace=True if len(df_other_tcrs) < max_pos else False,
|
| 192 |
+
# random_state=random_state
|
| 193 |
+
# ).reset_index(drop=True)
|
| 194 |
+
# neg_max.update(sampled_tcrs)
|
| 195 |
+
# neg_max[label_col] = neg_label
|
| 196 |
+
# neg_samples.append(neg_max)
|
| 197 |
+
|
| 198 |
+
# # 针对其他 peptides
|
| 199 |
+
# for pep, n_pos in pos_counts.items():
|
| 200 |
+
# if pep == max_peptide:
|
| 201 |
+
# continue
|
| 202 |
+
# n_neg = max(0, total_target - n_pos)
|
| 203 |
+
# df_other_tcrs = pos_df[pos_df['peptide'] != pep][['tcra', 'tcrb', 'cdr3a_start', 'cdr3a_end', 'cdr3b_start', 'cdr3b_end']].copy()
|
| 204 |
+
# neg_pep = pos_df[pos_df['peptide'] == pep].copy()
|
| 205 |
+
# sampled_tcrs = df_other_tcrs.sample(
|
| 206 |
+
# n=min(len(df_other_tcrs), n_neg),
|
| 207 |
+
# replace=True if len(df_other_tcrs) < n_neg else False,
|
| 208 |
+
# random_state=random_state
|
| 209 |
+
# ).reset_index(drop=True)
|
| 210 |
+
# sampled_tcrs = sampled_tcrs.iloc[:len(neg_pep)].copy() if len(sampled_tcrs) > len(neg_pep) else sampled_tcrs
|
| 211 |
+
# neg_pep = pd.concat(
|
| 212 |
+
# [neg_pep]*int(np.ceil(n_neg / len(neg_pep))), ignore_index=True
|
| 213 |
+
# ).iloc[:n_neg]
|
| 214 |
+
# neg_pep.update(sampled_tcrs)
|
| 215 |
+
# neg_pep[label_col] = neg_label
|
| 216 |
+
# neg_samples.append(neg_pep)
|
| 217 |
+
|
| 218 |
+
# neg_df = pd.concat(neg_samples, ignore_index=True)
|
| 219 |
+
# final_df = pd.concat([pos_df, neg_df], ignore_index=True).reset_index(drop=True)
|
| 220 |
+
|
| 221 |
+
# return final_df
|
| 222 |
+
|
| 223 |
+
def negative_sampling_phla(df, neg_ratio=5, label_col='label', neg_label=0, random_state=42):
    """
    Create negative samples by shuffling TCRs while keeping peptide–HLA pairs intact.

    Each shuffle round permutes the TCR columns among rows, so every
    peptide–HLA pair keeps its original frequency but is paired with a
    (usually) different TCR. Rows that reproduce an existing positive
    (tcra, tcrb, peptide, HLA_full) triplet are discarded.

    Args:
        df: DataFrame of positive samples; must contain 'tcra', 'tcrb',
            'peptide', 'HLA_full' and the CDR3 index columns.
        neg_ratio: desired number of negatives per positive row.
        label_col: name of the label column written on the output.
        neg_label: value assigned to label_col for negatives.
        random_state: base seed; shuffle round k uses random_state + k.

    Returns:
        DataFrame with at most len(df) * neg_ratio unique negative rows,
        label_col set to neg_label.
    """
    np.random.seed(random_state)  # reproducibility for any numpy-based consumers
    pos_triplets = set(zip(df['tcra'], df['tcrb'], df['peptide'], df['HLA_full']))
    tcr_cols = ['tcra', 'cdr3a_start', 'cdr3a_end', 'tcrb', 'cdr3b_start', 'cdr3b_end']

    n_pos = len(df)
    target_n_neg = n_pos * neg_ratio

    if target_n_neg == 0:
        # Empty input (or neg_ratio=0): return an empty frame with the expected
        # schema instead of crashing in pd.concat([]).
        empty = df.copy()
        empty[label_col] = neg_label
        return empty.iloc[0:0].reset_index(drop=True)

    all_neg = []
    n_rows = 0  # BUGFIX: count generated ROWS; the original compared len(all_neg)
    # (the number of shuffle rounds) against target_n_neg.
    max_rounds = 3 * neg_ratio + 10  # safety cap so degenerate inputs terminate

    for i in range(max_rounds):
        shuffled_df = df.copy()
        shuffled_tcr = df[tcr_cols].sample(frac=1, random_state=random_state + i).reset_index(drop=True)
        for col in tcr_cols:
            shuffled_df[col] = shuffled_tcr[col]

        # Keep only rows whose reassigned triplet is not a known positive.
        mask_keep = [
            (row['tcra'], row['tcrb'], row['peptide'], row['HLA_full']) not in pos_triplets
            for _, row in shuffled_df.iterrows()
        ]
        shuffled_df = shuffled_df[mask_keep]
        shuffled_df[label_col] = neg_label

        all_neg.append(shuffled_df)
        n_rows += len(shuffled_df)

        # Over-generate by 50% so the final dedup can still reach the target.
        if n_rows > target_n_neg * 1.5:
            break

    negative_samples = pd.concat(all_neg, ignore_index=True).drop_duplicates()
    negative_samples = negative_samples.sample(
        n=min(len(negative_samples), target_n_neg), random_state=random_state
    ).reset_index(drop=True)

    return negative_samples
| 262 |
+
|
| 263 |
+
# def negative_sampling_tcr(df, neg_ratio=5, label_col='label', neg_label=0, random_state=42):
|
| 264 |
+
# """
|
| 265 |
+
# Create negative samples by keeping TCR fixed but assigning random (peptide, HLA_full)
|
| 266 |
+
# pairs that do not exist in the original dataset.
|
| 267 |
+
# Ensures that the generated (TCR, peptide, HLA) triplets do not exist in the original data.
|
| 268 |
+
# """
|
| 269 |
+
# np.random.seed(random_state)
|
| 270 |
+
# negative_samples = []
|
| 271 |
+
|
| 272 |
+
# pos_triplets = set(zip(df['tcra'], df['tcrb'], df['peptide'], df['HLA_full']))
|
| 273 |
+
|
| 274 |
+
# all_pairs = list(set(zip(df['peptide'], df['HLA_full'])))
|
| 275 |
+
|
| 276 |
+
# for i in range(neg_ratio):
|
| 277 |
+
# neg_df = df.copy()
|
| 278 |
+
|
| 279 |
+
# # 随机打乱 peptide–HLA 对,但保证不会选原来的那一个
|
| 280 |
+
# new_pairs = []
|
| 281 |
+
# for _, row in df.iterrows():
|
| 282 |
+
# while True:
|
| 283 |
+
# pep, hla = all_pairs[np.random.randint(len(all_pairs))]
|
| 284 |
+
# triplet = (row['tcra'], row['tcrb'], pep, hla)
|
| 285 |
+
# if triplet not in pos_triplets:
|
| 286 |
+
# new_pairs.append((pep, hla))
|
| 287 |
+
# break
|
| 288 |
+
|
| 289 |
+
# neg_df[['peptide', 'HLA_full']] = pd.DataFrame(new_pairs, index=neg_df.index)
|
| 290 |
+
# neg_df[label_col] = neg_label
|
| 291 |
+
# negative_samples.append(neg_df)
|
| 292 |
+
|
| 293 |
+
# negative_samples = pd.concat(negative_samples, ignore_index=True).drop_duplicates()
|
| 294 |
+
# return negative_samples
|
| 295 |
+
|
| 296 |
+
class EarlyStopping:
    """Stop training when the validation score stops improving.

    Tracks the best validation AUC seen so far; saves the model whenever the
    score improves by more than ``delta`` and raises the ``early_stop`` flag
    after ``patience`` consecutive non-improving calls.
    """

    def __init__(self, patience=10, verbose=True, delta=0.0, save_path='checkpoint.pt'):
        """
        Early stopping based on both val_loss and val_auc.
        The model is saved whenever EITHER:
          - val_loss decreases by more than delta, OR
          - val_auc increases by more than delta.
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.early_stop = False
        self.delta = delta
        self.save_path = save_path

        # Best metrics observed so far.
        self.best_loss = np.inf
        self.best_auc = -np.inf

    def __call__(self, val_auc, model):
        # Improvement path: record the new best, checkpoint, reset the counter.
        if val_auc > self.best_auc + self.delta:
            self.best_auc = val_auc
            self.save_checkpoint(model, val_auc)
            self.counter = 0
            return

        # No improvement: bump the counter and possibly trigger early stop.
        self.counter += 1
        if self.verbose:
            print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, model, val_auc):
        """Save current best model."""
        if self.verbose:
            print(f"Validation improved → Saving model (Score={val_auc:.4f}) to {self.save_path}")
        torch.save(model.state_dict(), self.save_path)
| 337 |
+
|
| 338 |
+
# ============================================================================
|
| 339 |
+
# ESM2 Embedding via HuggingFace
|
| 340 |
+
# ============================================================================
|
| 341 |
+
class ESM2Encoder(nn.Module):
    """Frozen ESM2 (t33, 650M) encoder that caches per-residue embeddings on disk."""

    def __init__(self,
                 device="cuda:0",
                 layer=33,
                 cache_dir='cache'):
        """
        Initialize an ESM2 encoder.

        Args:
            device (str): Device to run on, e.g. 'cuda:0', 'cuda:1', or 'cpu'.
            layer (int): Layer number from which to extract representations.
            cache_dir (str | None): Cache sub-directory name; None falls back to
                this file's directory.
        """
        super().__init__()
        self.device = device
        self.layer = layer

        if cache_dir is None:
            cache_dir = os.path.dirname(os.path.abspath(__file__))
        self.cache_dir = cache_dir
        # NOTE(review): this makedirs call creates cache_dir relative to the CWD,
        # while _cache_path() re-joins it under this file's directory — confirm
        # which location is actually intended.
        os.makedirs(self.cache_dir, exist_ok=True)

        # Pretrained ESM2 loaded once; used frozen (eval mode, no grad).
        self.model, self.alphabet = esm.pretrained.esm2_t33_650M_UR50D()
        self.batch_converter = self.alphabet.get_batch_converter()
        self.model = self.model.eval().to(device)

    def _cache_path(self, prefix):
        # Cache file lives under <this file's dir>/<cache_dir>/.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        base_dir = base_dir + "/" + self.cache_dir
        os.makedirs(base_dir, exist_ok=True)
        return os.path.join(base_dir, f"{prefix}_esm2_layer{self.layer}.pt")

    def save_obj(self, obj, path):
        """Save object to a file (no compression)."""
        torch.save(obj, path)

    def load_obj(self, path):
        """Load object from a file (no compression)."""
        return torch.load(path, map_location="cpu", weights_only=False)

    @torch.no_grad()
    def _embed_batch(self, batch_data):
        """Embed one batch of (label, sequence) pairs.

        Returns a list of per-sequence [L, D] CPU tensors with the BOS/EOS
        token positions stripped.
        """
        batch_labels, batch_strs, batch_tokens = self.batch_converter(batch_data)
        batch_tokens = batch_tokens.to(self.device)
        results = self.model(batch_tokens, repr_layers=[self.layer], return_contacts=False)
        token_representations = results["representations"][self.layer]
        # True token count per item (non-padding positions, incl. BOS/EOS).
        batch_lens = (batch_tokens != self.alphabet.padding_idx).sum(1)
        seq_reprs = []
        for i, tokens_len in enumerate(batch_lens):
            # Slice 1:len-1 to drop the BOS and EOS representations.
            seq_repr = token_representations[i, 1:tokens_len-1].cpu()
            seq_reprs.append(seq_repr)
        return seq_reprs

    @torch.no_grad()
    def forward(self, df, seq_col, prefix, batch_size=64, re_embed=False, cache_save=True):
        """
        Add or update embeddings for sequences in a DataFrame.
        - If there are new sequences, automatically update the dictionary and save.
        - If re_embed=True, force re-computation of all sequences.

        Returns:
            dict: {upper-cased sequence string -> [L, D] embedding tensor}.
        """
        cache_path = self._cache_path(prefix)
        emb_dict = {}

        if os.path.exists(cache_path) and not re_embed:
            print(f"[ESM2] Loading cached embeddings from {cache_path}")
            emb_dict = self.load_obj(cache_path)
        else:
            if re_embed:
                print(f"[ESM2] Re-embedding all sequences for {prefix}")
            else:
                print(f"[ESM2] No existing cache for {prefix}, will create new.")

        # Normalise: keep only real strings, strip whitespace, upper-case.
        seqs = [str(s).strip().upper() for s in df[seq_col].tolist() if isinstance(s, str)]
        unique_seqs = sorted(set(seqs))
        new_seqs = [s for s in unique_seqs if s not in emb_dict]

        if new_seqs:
            print(f"[ESM2] Found {len(new_seqs)} new sequences → computing embeddings...")
            data = [(str(i), s) for i, s in enumerate(new_seqs)]
            for i in tqdm(range(0, len(data), batch_size), desc=f"ESM2 update ({prefix})"):
                batch = data[i:i+batch_size]
                embs = self._embed_batch(batch)
                for (_, seq), emb in zip(batch, embs):
                    emb_dict[seq] = emb.clone()
            if cache_save:
                print(f"[ESM2] Updating cache with new sequences")
                self.save_obj(emb_dict, cache_path)
        else:
            print(f"[ESM2] No new sequences for {prefix}, using existing cache")

        return emb_dict
| 432 |
+
|
| 433 |
+
# ============================================================================
|
| 434 |
+
# ESMFold (transformers)
|
| 435 |
+
# ============================================================================
|
| 436 |
+
class ESMFoldPredictorHF(nn.Module):
    """Thin wrapper around HuggingFace ESMFold for single-sequence structure prediction."""

    def __init__(self,
                 model_name="facebook/esmfold_v1",
                 cache_dir=None,
                 device='cpu',
                 allow_tf32=True):
        """
        Args:
            model_name (str): HuggingFace model id of the ESMFold checkpoint.
            cache_dir (str | None): HF download cache directory (None = HF default).
            device (str): Device for inference.
            allow_tf32 (bool): Enable TF32 matmul/cudnn kernels (faster on Ampere+ GPUs).
        """
        super().__init__()
        self.model_name = model_name
        self.cache_dir = cache_dir
        self.device = device
        if allow_tf32:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

        # tokenizer and model
        print(f"Loading ESMFold model {model_name} on {device}... {'with' if cache_dir else 'without'} cache_dir: {cache_dir}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        self.model = EsmForProteinFolding.from_pretrained(
            model_name, low_cpu_mem_usage=True, cache_dir=cache_dir
        ).eval().to(self.device)

    @torch.no_grad()
    def infer_pdb_str(self, seq: str) -> str:
        """Fold one protein sequence; returns the predicted structure as a PDB string."""
        pdb_str = self.model.infer_pdb(seq)
        return pdb_str

    @torch.no_grad()
    def forward_raw(self, seq: str):
        """Run the model directly on a tokenized sequence; returns the raw output object."""
        inputs = self.tokenizer([seq], return_tensors="pt", add_special_tokens=False)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        outputs = self.model(**inputs)
        return outputs  # ESMFoldOutput
| 468 |
+
|
| 469 |
+
# Maximum accessible surface area per residue type — used to normalise absolute
# ASA into relative solvent accessibility (RSA). Values presumably are the
# theoretical maxima (Å²) from Tien et al. 2013 — TODO confirm source.
MAX_ASA_TIEN = {
    "ALA": 129.0, "ARG": 274.0, "ASN": 195.0, "ASP": 193.0, "CYS": 167.0,
    "GLN": 225.0, "GLU": 223.0, "GLY": 104.0, "HIS": 224.0, "ILE": 197.0,
    "LEU": 201.0, "LYS": 236.0, "MET": 224.0, "PHE": 240.0, "PRO": 159.0,
    "SER": 155.0, "THR": 172.0, "TRP": 285.0, "TYR": 263.0, "VAL": 174.0,
}
# DSSP 8-state secondary-structure letter -> one-hot index.
# Unassigned ('-') shares the coil bucket (7) with 'C'.
SS8_INDEX = {"H":0,"B":1,"E":2,"G":3,"I":4,"T":5,"S":6,"C":7,"-":7}
| 476 |
+
|
| 477 |
+
class StructureFeatureExtractorNoDSSP(nn.Module):
    """Per-residue scalar features + CA coordinates from a predicted PDB file.

    Feature layout (17 dims per residue): backbone dihedral sin/cos
    (phi, psi, omega -> 6) + 8-state secondary-structure one-hot (8) +
    relative solvent accessibility (1) + CA contact count (1) + pLDDT (1).
    """

    def __init__(self, device="cpu"):
        super().__init__()
        self.device = device

        # angles(6) + SS8(8) + RSA(1) + contact count(1) + pLDDT(1)
        self.in_dim = 6 + 8 + 1 + 1 + 1  # 17

        self.to(torch.device(self.device))

    @torch.no_grad()
    def _angles(self, traj):
        """sin/cos of phi/psi/omega per residue -> [L, 6] float32.

        Residues lacking a given dihedral (e.g. chain termini) keep sin=cos=0.
        """
        L = traj.n_residues

        sphi = np.zeros(L, dtype=np.float32); cphi = np.zeros(L, dtype=np.float32)
        spsi = np.zeros(L, dtype=np.float32); cpsi = np.zeros(L, dtype=np.float32)
        someg = np.zeros(L, dtype=np.float32); comeg = np.zeros(L, dtype=np.float32)

        # 1) phi: (C_{i-1}, N_i, CA_i, C_i) — residue i is identified via atoms[1] (N_i)
        phi_idx, phi_vals = md.compute_phi(traj)  # phi_vals: (1, n_phi)
        if phi_vals.size > 0:
            for k, atoms in enumerate(phi_idx):
                res_i = traj.topology.atom(int(atoms[1])).residue.index  # residue owning N_i
                if 0 <= res_i < L:
                    ang = float(phi_vals[0, k])
                    sphi[res_i] = np.sin(ang); cphi[res_i] = np.cos(ang)

        # 2) psi: (N_i, CA_i, C_i, N_{i+1}) — residue i is identified via atoms[1] (CA_i)
        psi_idx, psi_vals = md.compute_psi(traj)
        if psi_vals.size > 0:
            for k, atoms in enumerate(psi_idx):
                res_i = traj.topology.atom(int(atoms[1])).residue.index  # CA_i
                if 0 <= res_i < L:
                    ang = float(psi_vals[0, k])
                    spsi[res_i] = np.sin(ang); cpsi[res_i] = np.cos(ang)

        # 3) omega: (CA_i, C_i, N_{i+1}, CA_{i+1}) — residue i is identified via atoms[0] (CA_i)
        omg_idx, omg_vals = md.compute_omega(traj)
        if omg_vals.size > 0:
            for k, atoms in enumerate(omg_idx):
                res_i = traj.topology.atom(int(atoms[0])).residue.index  # CA_i
                if 0 <= res_i < L:
                    ang = float(omg_vals[0, k])
                    someg[res_i] = np.sin(ang); comeg[res_i] = np.cos(ang)

        angles_feat = np.stack([sphi, cphi, spsi, cpsi, someg, comeg], axis=-1)  # [L, 6]
        return angles_feat.astype(np.float32)

    @torch.no_grad()
    def _ss8(self, traj: md.Trajectory):
        """8-state secondary-structure one-hot -> [L, 8].

        NOTE(review): this calls mdtraj's compute_dssp despite the class name
        "NoDSSP" — presumably "no external dssp binary"; confirm intent.
        """
        ss = md.compute_dssp(traj, simplified=False)[0]
        L = traj.n_residues
        onehot = np.zeros((L, 8), dtype=np.float32)
        for i, ch in enumerate(ss):
            onehot[i, SS8_INDEX.get(ch, 7)] = 1.0
        return onehot

    @torch.no_grad()
    def _rsa(self, traj: md.Trajectory):
        """Relative solvent accessibility, clipped to [0, 1] -> [L, 1].

        NOTE(review): mdtraj's shrake_rupley reports areas in nm² while the
        MAX_ASA_TIEN table is in Å² — confirm units before trusting absolute
        RSA magnitudes.
        """
        asa = md.shrake_rupley(traj, mode="residue")[0]  # (L,)
        rsa = np.zeros_like(asa, dtype=np.float32)
        for i, res in enumerate(traj.topology.residues):
            max_asa = MAX_ASA_TIEN.get(res.name.upper(), None)
            # Unknown residue names get RSA 0 rather than dividing by a guess.
            rsa[i] = 0.0 if not max_asa else float(asa[i] / max_asa)
        return np.clip(rsa, 0.0, 1.0)[:, None]

    @torch.no_grad()
    def _contact_count(self, traj: md.Trajectory, cutoff_nm=0.8):
        """Count of other residues within cutoff_nm (CA–CA distance) -> [L, 1]."""
        L = traj.n_residues
        ca_atoms = traj.topology.select("name CA")
        if len(ca_atoms) == L:
            coors = traj.xyz[0, ca_atoms, :]  # nm
        else:
            # Fallback when some residues lack a CA atom: use residue centroids.
            xyz = traj.xyz[0]
            coors = []
            for res in traj.topology.residues:
                idxs = [a.index for a in res.atoms]
                coors.append(xyz[idxs, :].mean(axis=0))
            coors = np.array(coors, dtype=np.float32)
        diff = coors[:, None, :] - coors[None, :, :]
        dist = np.sqrt((diff**2).sum(-1))  # nm
        mask = (dist < cutoff_nm).astype(np.float32)
        np.fill_diagonal(mask, 0.0)  # a residue is not its own contact
        cnt = mask.sum(axis=1)
        return cnt[:, None].astype(np.float32)

    @torch.no_grad()
    def _plddt(self, pdb_file: str):
        """Per-residue pLDDT normalised to [0, 1] -> [L, 1]."""
        # Read the PDB B-factor column with Biopython (ESMFold/AlphaFold write
        # the pLDDT confidence there).
        from Bio.PDB import PDBParser
        import numpy as np

        parser = PDBParser(QUIET=True)
        structure = parser.get_structure("prot", pdb_file)
        model = structure[0]

        res_plddt = []
        for chain in model:
            for residue in chain:
                atoms = list(residue.get_atoms())
                if len(atoms) == 0:
                    res_plddt.append(0.0)
                    continue
                # Mean B-factor over the residue's atoms.
                bvals = [float(atom.get_bfactor()) for atom in atoms]
                res_plddt.append(float(np.mean(bvals)))

        # Normalise to [0, 1].
        plddt = np.array(res_plddt, dtype=np.float32) / 100.0
        plddt = np.clip(plddt, 0.0, 1.0)
        return plddt[:, None]  # [L,1]

    @torch.no_grad()
    def _parse_and_features(self, pdb_file: str):
        """Load a PDB and return ([L,3] CA coords in Å, [L,17] scalar features)."""
        traj = md.load(pdb_file)
        L = traj.n_residues

        angles = self._angles(traj)  # [L,6]
        ss8 = self._ss8(traj)  # [L,8]
        rsa = self._rsa(traj)  # [L,1]
        cnt = self._contact_count(traj)  # [L,1]
        plddt = self._plddt(pdb_file)  # [L,1]

        feats = np.concatenate([angles, ss8, rsa, cnt, plddt], axis=1).astype(np.float32)  # [L,17]

        ca_atoms = traj.topology.select("name CA")
        if len(ca_atoms) == L:
            coors_nm = traj.xyz[0, ca_atoms, :]
        else:
            # Same centroid fallback as _contact_count for residues without a CA.
            xyz = traj.xyz[0]
            res_coords = []
            for res in traj.topology.residues:
                idxs = [a.index for a in res.atoms]
                res_coords.append(xyz[idxs, :].mean(axis=0))
            coors_nm = np.array(res_coords, dtype=np.float32)
        coors_ang = coors_nm * 10.0  # nm -> Å
        return coors_ang.astype(np.float32), feats  # [L,3], [L,17]

    @torch.no_grad()
    def forward(self, pdb_file: str):
        """Return (scalar features [N,17], CA coordinates [N,3]) as tensors on self.device."""
        coors_ang, scalars = self._parse_and_features(pdb_file)
        coors = torch.tensor(coors_ang, dtype=torch.float32, device=self.device)  # [N,3]
        scalars = torch.tensor(scalars, dtype=torch.float32, device=self.device)  # [N,17]

        return scalars, coors  # [N,17], [N,3]
| 622 |
+
|
| 623 |
+
class ResiduePipelineWithHFESM:
    """sequence -> (per-residue structural features [L,17], CA coords [L,3]).

    Folds the sequence with HuggingFace ESMFold, writes the predicted PDB to
    disk, then derives geometric features from that file.
    """

    def __init__(self,
                 esm_model_name="facebook/esmfold_v1",
                 cache_dir=None,
                 esm_device='cpu',
                 allow_tf32=True
                 ):
        self.esm = ESMFoldPredictorHF(esm_model_name, cache_dir, esm_device, allow_tf32)
        self.struct_encoder = StructureFeatureExtractorNoDSSP(device=esm_device)
        self.cache_dir = cache_dir

    @torch.no_grad()
    def __call__(self, seq: str, save_pdb_path: str = None) -> Tuple[torch.Tensor, torch.Tensor]:
        pdb_str = self.esm.infer_pdb_str(seq)
        if save_pdb_path is None:
            # NOTE(review): fixed default filename — concurrent callers sharing
            # the same cache/tmp dir would overwrite each other's PDB; confirm
            # single-process usage.
            tmpdir = self.cache_dir if self.cache_dir is not None else tempfile.gettempdir()
            save_pdb_path = str(Path(tmpdir) / "esmfold_pred_fold5.pdb")
        Path(save_pdb_path).write_text(pdb_str)

        struct_emb, struct_coords = self.struct_encoder(save_pdb_path)
        return struct_emb, struct_coords
| 644 |
+
|
| 645 |
+
def sanitize_protein_seq(seq: str) -> str:
    """Normalise a protein sequence string.

    Removes all whitespace, upper-cases the result, and keeps only characters
    from the extended amino-acid alphabet (20 standard residues plus
    X, B, Z, J, U, O). Non-string input yields an empty string.
    """
    if not isinstance(seq, str):
        return ""
    valid_chars = set("ACDEFGHIKLMNPQRSTVWYXBZJUO")
    compact = "".join(seq.split()).upper()
    kept = [ch for ch in compact if ch in valid_chars]
    return "".join(kept)
| 651 |
+
|
| 652 |
+
@torch.no_grad()
def batch_embed_to_dicts(
    df: pd.DataFrame,
    seq_col: str,
    pipeline,
    show_progress: bool = True,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], List[Tuple[str, str]]]:
    """
    Run a structure pipeline over the unique sanitized sequences in df[seq_col].

    Args:
        df: DataFrame holding the sequences.
        seq_col: column name to read sequences from.
        pipeline: callable seq -> (embedding [L, D], coords [L, 3]) tensors.
        show_progress: wrap the sequence loop in a tqdm progress bar.

    Returns:
        - emb_dict: {seq -> z(torch.Tensor[L, D])}
        - coord_dict:{seq -> coords(torch.Tensor[L, 3])}
        - failures: [(seq, err_msg), ...]
    """
    raw_list = df[seq_col].astype(str).tolist()
    # Sanitize and drop anything that reduces to an empty string.
    seqs = [s for s in (sanitize_protein_seq(r) for r in raw_list) if s]
    uniq_seqs = sorted(set(seqs))

    logger.info(f"Total rows: {len(df)}, valid seqs: {len(seqs)}, unique: {len(uniq_seqs)}")

    emb_dict: Dict[str, torch.Tensor] = {}
    coord_dict: Dict[str, torch.Tensor] = {}
    failures: List[Tuple[str, str]] = []

    # BUGFIX: wrap in tqdm at most once — the original re-wrapped the already
    # tqdm-wrapped iterator in a second tqdm call.
    iterator = tqdm(uniq_seqs, desc="ESMfold Predicting structure...") if show_progress else uniq_seqs
    for seq in iterator:
        if seq in emb_dict:
            continue
        try:
            z_t, c_t = pipeline(seq)  # z: [L, D], coords: [L, 3] (torch.Tensor)
            emb_dict[seq] = z_t.detach().float().cpu()
            coord_dict[seq] = c_t.detach().float().cpu()
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            failures.append((seq, repr(e)))

    logger.info(f"[DONE] OK: {len(emb_dict)}, Failed: {len(failures)}")
    if failures:
        # BUGFIX: pass the sample as a lazy %-style argument; the original
        # passed it as a stray positional, which breaks logging's formatting.
        logger.error("[SAMPLE failures] %s", failures[:3])
    return emb_dict, coord_dict, failures
| 696 |
+
|
| 697 |
+
class ESMFoldEncoder(nn.Module):
|
| 698 |
+
def __init__(self, model_name="facebook/esmfold_v1", esm_cache_dir="esm_cache", cache_dir="cache"):
|
| 699 |
+
super(ESMFoldEncoder, self).__init__()
|
| 700 |
+
self.model_name = model_name
|
| 701 |
+
self.esm_cache_dir = esm_cache_dir
|
| 702 |
+
self.cache_dir = cache_dir
|
| 703 |
+
|
| 704 |
+
def save_obj(self, obj, path):
|
| 705 |
+
"""Save object to a file (no compression)."""
|
| 706 |
+
torch.save(obj, path)
|
| 707 |
+
|
| 708 |
+
def load_obj(self, path):
|
| 709 |
+
"""Load object from a file (no compression)."""
|
| 710 |
+
return torch.load(path, map_location='cpu', weights_only=False)
|
| 711 |
+
|
| 712 |
+
def load_esm_dict(self, device, df_data, chain, re_embed):
|
| 713 |
+
|
| 714 |
+
def _clean_unique(series: pd.Series) -> list:
|
| 715 |
+
cleaned = []
|
| 716 |
+
for s in series.astype(str).tolist():
|
| 717 |
+
ss = sanitize_protein_seq(s)
|
| 718 |
+
if ss:
|
| 719 |
+
cleaned.append(ss)
|
| 720 |
+
return sorted(set(cleaned))
|
| 721 |
+
|
| 722 |
+
def _retry_embed_df(
|
| 723 |
+
df: pd.DataFrame,
|
| 724 |
+
chain: str,
|
| 725 |
+
max_retries: int = 2,
|
| 726 |
+
show_progress: bool = True,
|
| 727 |
+
):
|
| 728 |
+
"""
|
| 729 |
+
Try to embed protein sequences with retries on failures.
|
| 730 |
+
|
| 731 |
+
Args:
|
| 732 |
+
df (pd.DataFrame): A DataFrame containing a column `chain` with sequences.
|
| 733 |
+
chain (str): The column name containing the sequences (e.g., "alpha", "beta").
|
| 734 |
+
pipeline: An embedding pipeline, should return (embedding, coords) for a sequence.
|
| 735 |
+
max_retries (int): Maximum number of retries for failed sequences.
|
| 736 |
+
show_progress (bool): Whether to display tqdm progress bars.
|
| 737 |
+
|
| 738 |
+
Returns:
|
| 739 |
+
feat_dict (Dict[str, torch.Tensor]): {sequence -> embedding tensor [L, D]}.
|
| 740 |
+
coord_dict (Dict[str, torch.Tensor]): {sequence -> coordinate tensor [L, 3]}.
|
| 741 |
+
failures (List[Tuple[str, str]]): List of (sequence, error_message) that still failed after retries.
|
| 742 |
+
"""
|
| 743 |
+
|
| 744 |
+
pipeline = ResiduePipelineWithHFESM(
|
| 745 |
+
esm_model_name=self.model_name,
|
| 746 |
+
cache_dir=self.esm_cache_dir,
|
| 747 |
+
esm_device=device
|
| 748 |
+
)
|
| 749 |
+
|
| 750 |
+
# 1. First attempt
|
| 751 |
+
feat_dict, coord_dict, failures = batch_embed_to_dicts(
|
| 752 |
+
df, chain, pipeline, show_progress=show_progress
|
| 753 |
+
)
|
| 754 |
+
|
| 755 |
+
# 2. Retry loop for failed sequences
|
| 756 |
+
tries = 0
|
| 757 |
+
while failures and tries < max_retries:
|
| 758 |
+
tries += 1
|
| 759 |
+
retry_seqs = [s for s, _ in failures]
|
| 760 |
+
logger.info(f"[retry {tries}/{max_retries}] {len(retry_seqs)} sequences")
|
| 761 |
+
retry_df = pd.DataFrame({chain: retry_seqs})
|
| 762 |
+
|
| 763 |
+
f2, c2, failures = batch_embed_to_dicts(
|
| 764 |
+
retry_df, chain, pipeline, show_progress=show_progress
|
| 765 |
+
)
|
| 766 |
+
feat_dict.update(f2)
|
| 767 |
+
coord_dict.update(c2)
|
| 768 |
+
|
| 769 |
+
return feat_dict, coord_dict, failures
|
| 770 |
+
|
| 771 |
+
def update_with_new_seqs(feat_dict, coord_dict, chain):
|
| 772 |
+
base_dir = os.path.dirname(os.path.abspath(__file__))
|
| 773 |
+
base_dir = base_dir + "/" + self.cache_dir
|
| 774 |
+
os.makedirs(base_dir, exist_ok=True)
|
| 775 |
+
path_feat = os.path.join(base_dir, f"{chain}_feat_dict.pt")
|
| 776 |
+
path_coords = os.path.join(base_dir, f"{chain}_coord_dict.pt")
|
| 777 |
+
|
| 778 |
+
all_seqs_clean = _clean_unique(df_data[chain])
|
| 779 |
+
new_seqs = [s for s in all_seqs_clean if s not in feat_dict]
|
| 780 |
+
if not new_seqs:
|
| 781 |
+
logger.info(f"No new {chain} sequences found")
|
| 782 |
+
return feat_dict, coord_dict
|
| 783 |
+
|
| 784 |
+
logger.info(f"Found new {chain} sequences, embedding...")
|
| 785 |
+
df_new = pd.DataFrame({chain: new_seqs})
|
| 786 |
+
new_feat_dict, new_coord_dict, failures = _retry_embed_df(df_new, chain, max_retries=100)
|
| 787 |
+
feat_dict.update(new_feat_dict)
|
| 788 |
+
coord_dict.update(new_coord_dict)
|
| 789 |
+
self.save_obj(feat_dict, path_feat)
|
| 790 |
+
self.save_obj(coord_dict, path_coords)
|
| 791 |
+
|
| 792 |
+
if failures:
|
| 793 |
+
for seq, err in failures:
|
| 794 |
+
logger.error(f"[create] failed: {seq} | {err}")
|
| 795 |
+
|
| 796 |
+
logger.info(f"Updated and saved {path_feat} and {path_coords}")
|
| 797 |
+
|
| 798 |
+
return feat_dict, coord_dict
|
| 799 |
+
|
| 800 |
+
def get_or_create_dict(chain):
|
| 801 |
+
base_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + self.cache_dir
|
| 802 |
+
os.makedirs(base_dir, exist_ok=True)
|
| 803 |
+
path_feat = os.path.join(base_dir, f"{chain}_feat_dict.pt")
|
| 804 |
+
path_coords = os.path.join(base_dir, f"{chain}_coord_dict.pt")
|
| 805 |
+
|
| 806 |
+
if os.path.exists(path_feat) and not re_embed:
|
| 807 |
+
logger.info(f"Loading {path_feat} and {path_coords}")
|
| 808 |
+
feat_dict = self.load_obj(path_feat)
|
| 809 |
+
coord_dict = self.load_obj(path_coords)
|
| 810 |
+
else:
|
| 811 |
+
logger.info(f"{path_feat} and {path_coords} not found or re_embed=True, generating...")
|
| 812 |
+
unique_seqs = _clean_unique(df_data[chain])
|
| 813 |
+
df_uniq = pd.DataFrame({chain: unique_seqs})
|
| 814 |
+
feat_dict, coord_dict, failures = _retry_embed_df(
|
| 815 |
+
df_uniq, chain, show_progress=True, max_retries=100
|
| 816 |
+
)
|
| 817 |
+
self.save_obj(feat_dict, path_feat)
|
| 818 |
+
self.save_obj(coord_dict, path_coords)
|
| 819 |
+
|
| 820 |
+
if failures:
|
| 821 |
+
for seq, err in failures:
|
| 822 |
+
logger.error(f"[create] failed: {seq} | {err}")
|
| 823 |
+
|
| 824 |
+
logger.info(f"Saved {path_feat} and {path_coords}")
|
| 825 |
+
|
| 826 |
+
return feat_dict, coord_dict
|
| 827 |
+
|
| 828 |
+
self.dict[chain+'_feat'], self.dict[chain+'_coord'] = update_with_new_seqs(*get_or_create_dict(chain), chain)
|
| 829 |
+
|
| 830 |
+
def pad_and_stack(self, batch_feats, L_max, batch_coors):
    """
    Zero-pad per-sequence features/coordinates to a common length and stack.

    Args:
        batch_feats: list of [L_i, D] tensors.
        L_max: target padded length for every item.
        batch_coors: list of [L_i, 3] tensors (same order/lengths as batch_feats).

    Returns:
        feats: [B, L_max, D] zero-padded feature batch.
        coors: [B, L_max, 3] zero-padded coordinate batch.
        mask:  [B, L_max] boolean mask, True at real (non-pad) positions.
    """
    assert len(batch_feats) == len(batch_coors)

    padded_feats, padded_coors, valid_masks = [], [], []
    for feat, coor in zip(batch_feats, batch_coors):
        n_res = feat.shape[0]
        tail = L_max - n_res
        # F.pad pads the last dim first; (0, 0, 0, tail) pads only the length dim.
        padded_feats.append(torch.nn.functional.pad(feat, (0, 0, 0, tail)))
        padded_coors.append(torch.nn.functional.pad(coor, (0, 0, 0, tail)))
        keep = torch.zeros(L_max, dtype=torch.bool)
        keep[:n_res] = True
        valid_masks.append(keep)

    return (
        torch.stack(padded_feats, dim=0),   # [B, L_max, D]
        torch.stack(padded_coors, dim=0),   # [B, L_max, 3]
        torch.stack(valid_masks, dim=0),    # [B, L_max]
    )
|
| 861 |
+
|
| 862 |
+
def forward(self, df_data, chain, device='cpu', re_embed=False):
    """
    Collect cached per-residue embeddings/coordinates for every sequence in a column.

    Args:
        df_data: pd.DataFrame with a column `chain` containing sequences.
        chain: column name, e.g. "alpha" or "beta".
        device: device string handed to the embedding backend, e.g. 'cpu' or 'cuda:0'.
        re_embed: if True, regenerate embeddings even when cached files exist.

    Returns:
        (batch_feats, batch_coors): parallel lists of [L_i, D] / [L_i, 3] tensors,
        one entry per row of df_data[chain].

    Raises:
        ValueError: when a sanitized sequence is missing from the embedding cache.
    """
    self.dict = {}
    self.load_esm_dict(device, df_data, chain, re_embed)

    feat_lookup = self.dict[chain + '_feat']
    coord_lookup = self.dict[chain + '_coord']

    batch_feats, batch_coors = [], []
    for raw_seq in df_data[chain].astype(str).tolist():
        cleaned = sanitize_protein_seq(raw_seq)
        if cleaned not in feat_lookup or cleaned not in coord_lookup:
            raise ValueError(f"Sequence not found in embedding dict: {cleaned}")
        batch_feats.append(feat_lookup[cleaned])
        batch_coors.append(coord_lookup[cleaned])

    return batch_feats, batch_coors
|
| 885 |
+
|
| 886 |
+
|
| 887 |
+
# =================================== Dataset / Collate ===========================================
|
| 888 |
+
class PepHLA_Dataset(torch.utils.data.Dataset):
    """Peptide + HLA binding dataset backed by precomputed feature dictionaries."""

    def __init__(self, df, phys_dict, esm2_dict, struct_dict):
        # phys_dict / esm2_dict: {'pep': {seq: tensor}, 'hla': {seq: tensor}}
        # struct_dict: {hla_seq: (struct_feats, coords)}
        self.df = df
        self.phys_dict = phys_dict
        self.esm2_dict = esm2_dict
        self.struct_dict = struct_dict

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        pep_seq = record['peptide']
        hla_seq = record['HLA_full']
        struct_feats, struct_coords = self.struct_dict[hla_seq]

        return {
            'pep_phys': self.phys_dict['pep'][pep_seq],
            'pep_esm': self.esm2_dict['pep'][pep_seq],
            'hla_phys': self.phys_dict['hla'][hla_seq],
            'hla_esm': self.esm2_dict['hla'][hla_seq],
            'hla_struct': struct_feats,
            'hla_coord': struct_coords,
            'label': torch.tensor(record['label'], dtype=torch.float32),
            'pep_id': pep_seq,
            'hla_id': hla_seq,
        }
|
| 922 |
+
|
| 923 |
+
def peptide_hla_collate_fn(batch):
    """
    Collate PepHLA_Dataset items into padded batch tensors.

    Peptide tensors (keys 'pep_*' except 'pep_id') are padded or cropped to the
    longest peptide in the batch; all other tensors are stacked as-is; string
    ids are kept as lists. Adds 'pep_mask' ([B, L_pep] bool, True = real residue).
    """
    def _fit_length(t, valid_len, target_len):
        # Keep at most target_len valid rows, then zero-pad up to target_len.
        kept = t[:min(valid_len, target_len)]
        shortfall = target_len - kept.shape[0]
        if shortfall > 0:
            kept = torch.cat([kept, t.new_zeros(shortfall, t.shape[1])], dim=0)
        return kept

    def _bool_mask(lengths, width):
        rows = []
        for n in lengths:
            row = torch.zeros(width, dtype=torch.bool)
            row[:n] = True
            rows.append(row)
        return torch.stack(rows)

    pep_lens = [len(sample['pep_id']) for sample in batch]
    pep_width = max(pep_lens)

    collated = {}
    for key in batch[0]:
        if key == 'label':
            collated[key] = torch.stack([sample[key] for sample in batch])
        elif key.endswith('_id'):
            collated[key] = [sample[key] for sample in batch]
        elif key.startswith('pep_'):
            collated[key] = torch.stack([
                _fit_length(sample[key], len(sample['pep_id']), pep_width)
                for sample in batch
            ])
        else:
            collated[key] = torch.stack([sample[key] for sample in batch])

    collated['pep_mask'] = _bool_mask(pep_lens, pep_width)
    return collated
|
| 960 |
+
|
| 961 |
+
# =================================== Dataset / Collate ===========================================
|
| 962 |
+
class TCRPepHLA_Dataset(torch.utils.data.Dataset):
    """
    Dataset for TCRα + TCRβ + peptide + HLA binding.

    Each item bundles physicochemical / ESM / structural features for the four
    chains, CDR3 index spans for both TCR chains, pretrained pep-HLA features,
    the raw sequence ids, and the binary label.
    """

    def __init__(self, df, phys_dict, esm2_dict, struct_dict, pep_hla_feat_dict):
        self.df = df
        self.phys_dict = phys_dict              # {'tcra'/'tcrb'/'pep'/'hla': {seq: tensor}}
        self.esm2_dict = esm2_dict              # same layout as phys_dict
        self.struct_dict = struct_dict          # {'tcra'/...: {seq: (struct, coord)}}
        self.pep_hla_feat_dict = pep_hla_feat_dict  # {(pep, hla): {'pep_feat_pretrain', 'hla_feat_pretrain'}}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        tcra_seq = record['tcra']
        tcrb_seq = record['tcrb']
        pep_seq = record['peptide']
        hla_seq = record['HLA_full']

        tcra_struct, tcra_coord = self.struct_dict['tcra'][tcra_seq]
        tcrb_struct, tcrb_coord = self.struct_dict['tcrb'][tcrb_seq]
        pep_struct, pep_coord = self.struct_dict['pep'][pep_seq]
        hla_struct, hla_coord = self.struct_dict['hla'][hla_seq]

        pretrained = self.pep_hla_feat_dict[(pep_seq, hla_seq)]

        return {
            # TCRα
            'tcra_phys': self.phys_dict['tcra'][tcra_seq],
            'tcra_esm': self.esm2_dict['tcra'][tcra_seq],
            'tcra_struct': tcra_struct,
            'tcra_coord': tcra_coord,
            'cdr3a_start': torch.tensor(record['cdr3a_start'], dtype=torch.long),
            'cdr3a_end': torch.tensor(record['cdr3a_end'], dtype=torch.long),

            # TCRβ
            'tcrb_phys': self.phys_dict['tcrb'][tcrb_seq],
            'tcrb_esm': self.esm2_dict['tcrb'][tcrb_seq],
            'tcrb_struct': tcrb_struct,
            'tcrb_coord': tcrb_coord,
            'cdr3b_start': torch.tensor(record['cdr3b_start'], dtype=torch.long),
            'cdr3b_end': torch.tensor(record['cdr3b_end'], dtype=torch.long),

            # peptide
            'pep_phys': self.phys_dict['pep'][pep_seq],
            'pep_esm': self.esm2_dict['pep'][pep_seq],
            'pep_struct': pep_struct,
            'pep_coord': pep_coord,

            # HLA
            'hla_phys': self.phys_dict['hla'][hla_seq],
            'hla_esm': self.esm2_dict['hla'][hla_seq],
            'hla_struct': hla_struct,
            'hla_coord': hla_coord,

            # ids / label
            'tcra_id': tcra_seq,
            'tcrb_id': tcrb_seq,
            'pep_id': pep_seq,
            'hla_id': hla_seq,
            'label': torch.tensor(record['label'], dtype=torch.float32),

            # pretrained peptide-HLA interaction features
            'pep_feat_pretrain': pretrained['pep_feat_pretrain'],
            'hla_feat_pretrain': pretrained['hla_feat_pretrain'],
        }
|
| 1050 |
+
|
| 1051 |
+
# =================================== Collate Function ===========================================
|
| 1052 |
+
def tcr_pep_hla_collate_fn(batch):
    """
    Collate TCRPepHLA_Dataset items into padded batch tensors.

    Variable-length tensors (keys prefixed 'tcra_', 'tcrb_' or 'pep_', except
    the '*_id' strings) are padded/cropped to the batch-max length of the
    matching sequence; everything else is stacked as-is; ids stay as lists.
    Adds boolean masks 'tcra_mask' / 'tcrb_mask' / 'pep_mask' (True = real residue).
    """
    def _fit_length(t, valid_len, target_len):
        # Keep at most target_len valid rows, then zero-pad up to target_len.
        kept = t[:min(valid_len, target_len)]
        shortfall = target_len - kept.shape[0]
        if shortfall > 0:
            kept = torch.cat([kept, t.new_zeros(shortfall, t.shape[1])], dim=0)
        return kept

    def _bool_mask(lengths, width):
        rows = []
        for n in lengths:
            row = torch.zeros(width, dtype=torch.bool)
            row[:n] = True
            rows.append(row)
        return torch.stack(rows)

    chains = ('tcra', 'tcrb', 'pep')
    lengths = {c: [len(sample[c + '_id']) for sample in batch] for c in chains}
    widths = {c: max(ls) for c, ls in lengths.items()}

    collated = {}
    for key in batch[0]:
        if key == 'label':
            collated[key] = torch.stack([sample[key] for sample in batch])
            continue
        if key.endswith('_id'):
            collated[key] = [sample[key] for sample in batch]
            continue
        chain = next((c for c in chains if key.startswith(c + '_')), None)
        if chain is not None:
            collated[key] = torch.stack([
                _fit_length(sample[key], len(sample[chain + '_id']), widths[chain])
                for sample in batch
            ])
        else:
            # cdr3 index scalars and fixed-length HLA tensors are stacked directly
            collated[key] = torch.stack([sample[key] for sample in batch])

    collated['tcra_mask'] = _bool_mask(lengths['tcra'], widths['tcra'])
    collated['tcrb_mask'] = _bool_mask(lengths['tcrb'], widths['tcrb'])
    collated['pep_mask'] = _bool_mask(lengths['pep'], widths['pep'])

    return collated
|
| 1106 |
+
|
| 1107 |
+
# ==================================== 小积木:投影 + 门控 =========================================
|
| 1108 |
+
class ResidueProjector(nn.Module):
    """Align a branch's channel dimension to a shared width.

    Applies Linear(in_dim -> out_dim) when the widths differ; otherwise the
    input passes through unchanged (Identity).
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        if in_dim == out_dim:
            self.proj = nn.Identity()
        else:
            self.proj = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        # x: [B, L, in_dim] -> [B, L, out_dim]
        return self.proj(x)
|
| 1115 |
+
|
| 1116 |
+
class ResidueDoubleFusion(nn.Module):
    """
    Residue-level fusion of two aligned modalities via cross-attention plus a
    per-residue sigmoid gate.

    x1 attends over x2; a scalar gate then mixes the original x1 with the
    attended output, and the blend is projected and layer-normalized.

    Typical usage:
        - x1: physicochemical features
        - x2: ESM embeddings (or structure features)
    """
    def __init__(self, dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.dim = dim

        # Cross-attention: lets x1 query information from x2 per residue.
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True
        )

        # Per-residue scalar gate in [0, 1], computed from [x1; attn_out].
        self.gate = nn.Sequential(
            nn.Linear(dim * 2, dim),
            nn.ReLU(),
            nn.Linear(dim, 1),
            nn.Sigmoid()
        )

        # Post-fusion projection.
        self.out_proj = nn.Linear(dim, dim)

        # Layer norms for stable training.
        self.norm_x1 = nn.LayerNorm(dim)
        self.norm_x2 = nn.LayerNorm(dim)
        self.norm_out = nn.LayerNorm(dim)

    def forward(self, x1, x2):
        """
        Args:
            x1: Tensor [B, L, D] - first modality (acts as queries).
            x2: Tensor [B, L, D] - second modality (acts as keys/values).

        Returns:
            Tensor [B, L, D] - fused residue-level representation.
        """
        # Normalized views feed the attention; the gate mixes the raw x1.
        attended, _ = self.cross_attn(
            query=self.norm_x1(x1),
            key=self.norm_x2(x2),
            value=self.norm_x2(x2)
        )  # [B, L, D]

        mix = self.gate(torch.cat([x1, attended], dim=-1))  # [B, L, 1]
        blended = mix * x1 + (1 - mix) * attended

        return self.norm_out(self.out_proj(blended))
|
| 1181 |
+
|
| 1182 |
+
class ResidueTripleFusion(nn.Module):
    """
    Hierarchical three-branch residue-level fusion.

    Stage 1 merges physicochemical features with protein-language-model
    embeddings; stage 2 merges that intermediate result with structure-based
    features. Both stages use ResidueDoubleFusion
    (cross-attention + gating + linear projection).
    """
    def __init__(self, dim, num_heads=8, dropout=0.1):
        super().__init__()
        # Stage 1: physicochemical x ESM.
        self.fuse_phys_esm = ResidueDoubleFusion(dim, num_heads=num_heads, dropout=dropout)
        # Stage 2: (phys+esm) x structure.
        self.fuse_f12_struct = ResidueDoubleFusion(dim, num_heads=num_heads, dropout=dropout)

    def forward(self, phys, esm, struct):
        """
        Args:
            phys:   Tensor [B, L, D], physicochemical features (e.g. AAindex-based).
            esm:    Tensor [B, L, D], protein language model embeddings.
            struct: Tensor [B, L, D], structure-derived features.

        Returns:
            Tensor [B, L, D], final fused representation.
        """
        intermediate = self.fuse_phys_esm(phys, esm)
        return self.fuse_f12_struct(intermediate, struct)
|
| 1216 |
+
|
| 1217 |
+
class BANLayer(nn.Module):
    """
    Bilinear Attention Network Layer with proper 2D masked-softmax.
    v_mask: [B, L_v] True=valid
    q_mask: [B, L_q] True=valid

    Projects v/q into a shared h_dim*k space, builds h_out bilinear attention
    maps over all (v, q) position pairs, then pools v and q through each map
    and sums the pooled vectors into a single [B, h_dim] representation.
    """
    def __init__(self, v_dim, q_dim, h_dim, h_out, act='ReLU', dropout=0.2, k=3):
        super().__init__()
        # Threshold on h_out: below it, attention maps come from the low-rank
        # parameters h_mat/h_bias; above it, from a weight-normed linear head.
        self.c = 32
        self.k = k
        self.v_dim = v_dim
        self.q_dim = q_dim
        self.h_dim = h_dim
        self.h_out = h_out

        # Per-position projections into the shared h_dim*k space.
        self.v_net = FCNet([v_dim, h_dim * self.k], act=act, dropout=dropout)
        self.q_net = FCNet([q_dim, h_dim * self.k], act=act, dropout=dropout)

        if 1 < k:
            # Average-pools groups of k channels back down to h_dim after pooling.
            self.p_net = nn.AvgPool1d(self.k, stride=self.k)

        if h_out <= self.c:
            self.h_mat = nn.Parameter(torch.Tensor(1, h_out, 1, h_dim * self.k).normal_())
            self.h_bias = nn.Parameter(torch.Tensor(1, h_out, 1, 1).normal_())
        else:
            self.h_net = weight_norm(nn.Linear(h_dim * self.k, h_out), dim=None)

        self.bn = nn.BatchNorm1d(h_dim)

    def attention_pooling(self, v, q, att_map):  # att_map: [B, L_v, L_q]
        # Bilinearly pool v and q through one attention map -> [B, h_dim*k],
        # then (if k > 1) average-pool channel groups of k, rescaled by k -> [B, h_dim].
        logits = torch.einsum('bvk,bvq,bqk->bk', (v, att_map, q))
        if 1 < self.k:
            logits = self.p_net(logits.unsqueeze(1)).squeeze(1) * self.k
        return logits

    def _masked_softmax_2d(self, logits, v_mask, q_mask):
        """
        logits: [B, h_out, L_v, L_q]
        v_mask: [B, L_v] or None
        q_mask: [B, L_q] or None
        return: probs [B, h_out, L_v, L_q] (masked entries=0, normalized over the valid 2D submatrix)
        """
        B, H, Lv, Lq = logits.shape
        device = logits.device
        if v_mask is None:
            v_mask = torch.ones(B, Lv, dtype=torch.bool, device=device)
        if q_mask is None:
            q_mask = torch.ones(B, Lq, dtype=torch.bool, device=device)

        # A cell is valid only when both its v and q positions are valid.
        mask2d = (v_mask[:, :, None] & q_mask[:, None, :])  # [B, Lv, Lq]
        mask2d = mask2d[:, None, :, :].expand(B, H, Lv, Lq)  # [B, H, Lv, Lq]

        logits = logits.masked_fill(~mask2d, -float('inf'))

        # Softmax over the joint Lv*Lq space.
        flat = logits.view(B, H, -1)  # [B, H, Lv*Lq]
        # Edge case: a sample may have no valid cells at all; avoid NaN.
        flat = torch.where(torch.isinf(flat), torch.full_like(flat, -1e9), flat)
        flat = F.softmax(flat, dim=-1)
        flat = torch.nan_to_num(flat, nan=0.0)  # safety fallback
        probs = flat.view(B, H, Lv, Lq)

        # Zero out masked positions (numerical stability & easier visualization).
        probs = probs * mask2d.float()
        return probs

    def forward(self, v, q, v_mask=None, q_mask=None, softmax=True):
        """
        v: [B, L_v, Dv], q: [B, L_q, Dq]
        Returns (logits [B, h_dim], att_maps [B, h_out, L_v, L_q]).
        """
        B, L_v, _ = v.size()
        _, L_q, _ = q.size()

        v_ = self.v_net(v)  # [B, L_v, h_dim*k]
        q_ = self.q_net(q)  # [B, L_q, h_dim*k]

        if self.h_out <= self.c:
            att_maps = torch.einsum('xhyk,bvk,bqk->bhvq', (self.h_mat, v_, q_)) + self.h_bias  # [B,H,Lv,Lq]
        else:
            v_t = v_.transpose(1, 2).unsqueeze(3)  # [B, K, Lv, 1]
            q_t = q_.transpose(1, 2).unsqueeze(2)  # [B, K, 1, Lq]
            d_ = torch.matmul(v_t, q_t)  # [B, K, Lv, Lq]
            att_maps = self.h_net(d_.permute(0, 2, 3, 1))  # [B, Lv, Lq, H]
            att_maps = att_maps.permute(0, 3, 1, 2)  # [B, H, Lv, Lq]

        if softmax:
            att_maps = self._masked_softmax_2d(att_maps, v_mask, q_mask)
        else:
            # Even without softmax, zero out invalid cells to avoid leakage.
            if v_mask is not None:
                att_maps = att_maps.masked_fill(~v_mask[:, None, :, None], 0.0)
            if q_mask is not None:
                att_maps = att_maps.masked_fill(~q_mask[:, None, None, :], 0.0)

        # Note: v_ / q_ are still [B, L, K] here, aligned with att_maps' [B,H,Lv,Lq].
        logits = self.attention_pooling(v_, q_, att_maps[:, 0, :, :])
        for i in range(1, self.h_out):
            logits = logits + self.attention_pooling(v_, q_, att_maps[:, i, :, :])

        logits = self.bn(logits)
        return logits, att_maps
|
| 1318 |
+
|
| 1319 |
+
class FCNet(nn.Module):
    """
    Weight-normalized MLP: (Dropout ->) Linear (-> activation) per stage.

    `dims` lists the layer widths; e.g. [in, hidden, out] builds two linear
    stages. `act` is looked up on torch.nn by name (e.g. 'ReLU'); an empty
    string disables activations. Dropout is skipped when `dropout <= 0`.
    """

    def __init__(self, dims, act='ReLU', dropout=0.2):
        super(FCNet, self).__init__()
        stages = []

        def _add_stage(in_dim, out_dim):
            # One stage: optional dropout, weight-normed linear, optional activation.
            if dropout > 0:
                stages.append(nn.Dropout(dropout))
            stages.append(weight_norm(nn.Linear(in_dim, out_dim), dim=None))
            if act != '':
                stages.append(getattr(nn, act)())

        # All intermediate stages, then the final dims[-2] -> dims[-1] stage.
        for left, right in zip(dims[:-2], dims[1:-1]):
            _add_stage(left, right)
        _add_stage(dims[-2], dims[-1])

        self.main = nn.Sequential(*stages)

    def forward(self, x):
        return self.main(x)
|
| 1342 |
+
|
| 1343 |
+
class StackedEGNN(nn.Module):
    """A stack of EGNN layers applied sequentially to (feats, coors)."""

    def __init__(self, dim, layers, update_coors=False, **egnn_kwargs):
        super().__init__()
        self.layers = nn.ModuleList(
            EGNN(dim=dim, update_coors=update_coors, **egnn_kwargs)
            for _ in range(layers)
        )

    def forward(self, feats, coors, mask=None):
        """
        Args:
            feats: [B, L_max, D] node features.
            coors: [B, L_max, 3] node coordinates.
            mask:  [B, L_max] bool mask or None.

        Returns:
            (feats, coors) after passing through every EGNN layer in order.
        """
        for egnn_layer in self.layers:
            feats, coors = egnn_layer(feats, coors, mask=mask)
        return feats, coors
|
| 1356 |
+
|
| 1357 |
+
class FocalLoss(nn.Module):
    """
    Binary focal loss on logits.

    loss = alpha_t * (1 - p_t)^gamma * BCE, where p_t is the model's
    probability of the true class and alpha_t = alpha for positives,
    (1 - alpha) for negatives.
    """

    def __init__(self, alpha=0.5, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction  # 'mean' | 'sum' | anything else -> per-element

    def forward(self, inputs, targets):
        """
        Args:
            inputs:  logits, same shape as targets.
            targets: binary float labels in {0, 1}.

        Returns:
            Scalar loss for 'mean'/'sum', otherwise the per-element loss tensor.
        """
        per_elem_bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        prob_true = torch.exp(-per_elem_bce)  # p_t

        alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
        focal = alpha_t * (1 - prob_true) ** self.gamma * per_elem_bce

        if self.reduction == 'mean':
            return torch.mean(focal)
        if self.reduction == 'sum':
            return torch.sum(focal)
        return focal
|
| 1377 |
+
|
| 1378 |
+
# ===================================== 主模型(完全版) ===========================================
|
| 1379 |
+
class PeptideHLABindingPredictor(nn.Module):
|
| 1380 |
+
def __init__(
    self,
    phys_dim=20,  # output dim of the physicochemical encoder (PhysicochemicalEncoder)
    pep_dim=256,  # unified peptide channel width
    hla_dim=256,  # unified HLA channel width
    bilinear_dim=256,
    pseudo_seq_pos=None,  # pocket positions (assumed 0-based and within [0, 179])
    device="cuda:0",
    loss_fn='bce',
    alpha=0.5,
    gamma=2.0,
    dropout=0.2,
    pos_weights=None
):
    """
    Build the peptide-HLA binding predictor.

    Per-residue physicochemical, ESM and (for HLA) EGNN structural features
    are projected to a shared width, gate-fused, encoded with Transformer
    encoders, and scored through a bilinear attention (BAN) head.

    Args:
        phys_dim: channel width of the physicochemical features.
        pep_dim: unified peptide channel width.
        hla_dim: unified HLA channel width.
        bilinear_dim: hidden width of the BAN interaction layer.
        pseudo_seq_pos: pocket residue indices; defaults to a fixed list
            shifted by -2 (presumably converting 1-based positions to
            0-based array indices -- TODO confirm).
        device: device the forward pass moves batch tensors to.
        loss_fn: 'bce' or 'focal'.
        alpha: focal-loss class-balance weight (used when loss_fn='focal').
        gamma: focal-loss focusing exponent (used when loss_fn='focal').
        dropout: dropout rate for attention and encoder layers.
        pos_weights: optional positive-class weight for BCEWithLogitsLoss.

    Raises:
        ValueError: if loss_fn is neither 'bce' nor 'focal'.
    """
    super().__init__()
    self.device = device
    self.pep_dim = pep_dim
    self.hla_dim = hla_dim
    self.bilinear_dim = bilinear_dim
    self.alpha = alpha
    self.gamma = gamma
    self.dropout = dropout
    if loss_fn == 'bce':
        self.loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weights]) if pos_weights is not None else None)
    elif loss_fn == 'focal':
        self.loss_fn = FocalLoss(alpha=alpha, gamma=gamma)
    else:
        raise ValueError(f"Unknown loss function: {loss_fn}")

    # EGNN stack over the HLA structural features (17 input channels).
    self.se3_model = StackedEGNN(
        dim=17, layers=3
    )

    self.max_pep_len = 20
    self.max_hla_len = 180

    # Learned positional embedding tables, sliced to sequence length at use.
    self.pep_pos_embed = nn.Parameter(torch.randn(self.max_pep_len, pep_dim))
    self.hla_pos_embed = nn.Parameter(torch.randn(self.max_hla_len, hla_dim))

    # -- Project each branch to a unified per-residue dimension --
    # peptide branch (physicochemical -> pep_dim, ESM2(1280) -> pep_dim)
    self.proj_pep_phys = ResidueProjector(in_dim=phys_dim, out_dim=pep_dim)  # PhysEnc output width should equal phys_dim
    self.proj_pep_esm = ResidueProjector(in_dim=1280, out_dim=pep_dim)

    # HLA branch (physicochemical -> hla_dim, ESM2(1280) -> hla_dim, struct(17) -> hla_dim)
    self.proj_hla_phys = ResidueProjector(in_dim=phys_dim, out_dim=hla_dim)
    self.proj_hla_esm = ResidueProjector(in_dim=1280, out_dim=hla_dim)
    self.proj_hla_se3 = ResidueProjector(in_dim=17, out_dim=hla_dim)  # se3_model output width is 17

    # -- Gated fusion (per residue) --
    self.gate_pep = ResidueDoubleFusion(pep_dim)  # pep_phys x pep_esm
    self.gate_hla = ResidueTripleFusion(hla_dim)  # hla_phys x hla_esm x hla_struct

    d_model = self.pep_dim
    n_heads = 8

    # 1. "Peptide queries HLA" cross-attention (pep as Q, HLA as K/V)
    self.cross_attn_pep_hla = nn.MultiheadAttention(
        embed_dim=d_model,
        num_heads=n_heads,
        dropout=self.dropout,
        batch_first=True
    )
    self.norm_cross_pep = nn.LayerNorm(d_model)

    # 2. "HLA queries peptide" cross-attention (HLA as Q, pep as K/V)
    self.cross_attn_hla_pep = nn.MultiheadAttention(
        embed_dim=d_model,
        num_heads=n_heads,
        dropout=self.dropout,
        batch_first=True
    )
    self.norm_cross_hla = nn.LayerNorm(d_model)

    # -- Interaction module (bilinear attention map) --
    self.bi_attn = BANLayer(v_dim=pep_dim, q_dim=hla_dim, h_dim=bilinear_dim, h_out=4, k=3)

    # -- Prediction head --
    self.head = nn.Sequential(
        nn.Linear(bilinear_dim, bilinear_dim),
        nn.ReLU(),
        nn.Linear(bilinear_dim, 1)
    )

    # -- Pocket positions --
    if pseudo_seq_pos is None:
        pseudo_seq_pos = [i-2 for i in [7, 9, 24, 45, 59, 62, 63, 66, 67, 69, 70, 73, 74, 76, 77, 80, 81, 84, 95, 97, 99, 114, 116, 118, 143, 147, 150, 152, 156, 158, 159, 163, 167, 171]]
    self.register_buffer("contact_idx", torch.tensor(pseudo_seq_pos, dtype=torch.long))

    # --------------------------------------------
    # Transformer Encoders for peptide & HLA
    # --------------------------------------------
    encoder_layer_pep = TransformerEncoderLayer(
        d_model=pep_dim,  # input dim
        nhead=8,  # number of attention heads (tunable)
        dim_feedforward=pep_dim*4,
        dropout=self.dropout,
        batch_first=True  # input shape [B, L, D]
    )
    self.pep_encoder = TransformerEncoder(encoder_layer_pep, num_layers=2)  # number of layers (tunable)

    encoder_layer_hla = TransformerEncoderLayer(
        d_model=hla_dim,
        nhead=8,
        dim_feedforward=hla_dim*4,
        dropout=self.dropout,
        batch_first=True
    )
    self.hla_encoder = TransformerEncoder(encoder_layer_hla, num_layers=1)
|
| 1489 |
+
|
| 1490 |
+
# -------------------------- 工具:把 list of [L,D] pad 成 [B,L_max,D] --------------------------
|
| 1491 |
+
def _pad_stack(self, tensors, L_max=None):
|
| 1492 |
+
Ls = [t.shape[0] for t in tensors]
|
| 1493 |
+
if L_max is None: L_max = max(Ls)
|
| 1494 |
+
D = tensors[0].shape[-1]
|
| 1495 |
+
B = len(tensors)
|
| 1496 |
+
out = tensors[0].new_zeros((B, L_max, D))
|
| 1497 |
+
mask = torch.zeros(B, L_max, dtype=torch.bool, device=out.device)
|
| 1498 |
+
for i, t in enumerate(tensors):
|
| 1499 |
+
L = t.shape[0]
|
| 1500 |
+
out[i, :L] = t
|
| 1501 |
+
mask[i, :L] = True
|
| 1502 |
+
return out, mask # [B,L_max,D], [B,L_max]
|
| 1503 |
+
|
| 1504 |
+
# ----------------------------------- 口袋掩码 --------------------------------------
|
| 1505 |
+
|
| 1506 |
+
def _mask_to_pockets(self, hla_feat):
|
| 1507 |
+
"""
|
| 1508 |
+
从 HLA 特征中只保留 pocket 位点,返回:
|
| 1509 |
+
- hla_pocket: [B, n_pocket, D]
|
| 1510 |
+
- pocket_mask: [B, n_pocket] (全部 True)
|
| 1511 |
+
"""
|
| 1512 |
+
B, L, D = hla_feat.shape
|
| 1513 |
+
|
| 1514 |
+
# ensure idx in [0, L-1]
|
| 1515 |
+
idx = self.contact_idx.clamp(min=0, max=L-1)
|
| 1516 |
+
# gather pocket features
|
| 1517 |
+
hla_pocket = hla_feat[:, idx, :] # [B, n_pocket, D]
|
| 1518 |
+
|
| 1519 |
+
return hla_pocket
|
| 1520 |
+
|
| 1521 |
+
def add_positional_encoding(self, x, pos_embed):
|
| 1522 |
+
"""
|
| 1523 |
+
x: [B, L, D]
|
| 1524 |
+
pos_embed: [L_max, D]
|
| 1525 |
+
"""
|
| 1526 |
+
B, L, D = x.shape
|
| 1527 |
+
# 截取前 L 个位置编码
|
| 1528 |
+
pe = pos_embed[:L, :].unsqueeze(0).expand(B, -1, -1) # [B, L, D]
|
| 1529 |
+
return x + pe
|
| 1530 |
+
|
| 1531 |
+
    def forward(self, batch):
        """Predict peptide–HLA binding for one DataLoader batch.

        Args:
            batch: dict of tensors — peptide phys/ESM features, HLA phys/ESM/
                structure/coordinate features, a peptide padding mask and labels.
                (Exact shapes are fixed by the dataset; HLA structure is
                commented as [B, 180, 17] below — confirm against the loader.)

        Returns:
            probs: numpy array of sigmoid binding probabilities.
            binding_loss: loss from ``self.loss_fn`` on the logits vs labels.
            attn: numpy bilinear-attention map, summed over attention heads.
            fused_vec: numpy fused pair representation from the BAN layer.
        """
        # Take the batch from the DataLoader and move it to the model device.
        pep_phys = batch['pep_phys'].to(self.device, non_blocking=True)
        pep_esm = batch['pep_esm'].to(self.device, non_blocking=True)
        hla_phys = batch['hla_phys'].to(self.device, non_blocking=True)
        hla_esm = batch['hla_esm'].to(self.device, non_blocking=True)
        hla_struct = batch['hla_struct'].to(self.device, non_blocking=True)
        hla_coord = batch['hla_coord'].to(self.device, non_blocking=True)
        labels = batch['label'].to(self.device)

        # 1) peptide: physicochemical + ESM2 features -> gated fusion
        pep_phys = self.proj_pep_phys(pep_phys)
        pep_esm = self.proj_pep_esm(pep_esm)
        pep_feat = self.gate_pep(pep_phys, pep_esm)  # [B, Lp, D]

        pep_feat = self.add_positional_encoding(pep_feat, self.pep_pos_embed)
        # Transformer expects True at *pad* positions, hence the inverted mask.
        pep_feat = self.pep_encoder(pep_feat, src_key_padding_mask=~batch['pep_mask'].to(self.device, non_blocking=True))

        # 2) HLA: physicochemical + ESM2 + structure -> SE(3) model -> gated fusion
        hla_phys = self.proj_hla_phys(hla_phys)
        hla_esm = self.proj_hla_esm(hla_esm)
        # hla_struct is [B, 180, 17]; run it through the SE(3) model first
        hla_se3 = self.se3_model(hla_struct, hla_coord, None)[0]  # [B, 180, 17]
        hla_se3 = self.proj_hla_se3(hla_se3)  # project to the shared hidden dim (256)
        hla_feat = self.gate_hla(hla_phys, hla_esm, hla_se3)

        hla_feat = self.add_positional_encoding(hla_feat, self.hla_pos_embed)
        hla_feat = self.hla_encoder(hla_feat)

        # Cross attention: peptide queries attend over HLA keys/values.
        pep_feat_cross, _ = self.cross_attn_pep_hla(
            query=pep_feat,
            key=hla_feat,
            value=hla_feat,
            key_padding_mask=None
        )

        # Cross attention: HLA queries attend over peptide keys/values
        # (peptide padding positions are masked out).
        hla_feat_cross, _ = self.cross_attn_hla_pep(
            query=hla_feat,
            key=pep_feat,
            value=pep_feat,
            key_padding_mask=~batch['pep_mask'].to(self.device, non_blocking=True)
        )

        # Residual + LayerNorm around each cross-attention branch.
        pep_feat_updated = self.norm_cross_pep(pep_feat + pep_feat_cross)
        hla_feat_updated = self.norm_cross_hla(hla_feat + hla_feat_cross)

        # 3) restrict HLA features to the binding-pocket residues
        hla_pocket = self._mask_to_pockets(hla_feat_updated)

        # 4) bilinear attention between peptide and pocket features
        fused_vec, attn = self.bi_attn(
            pep_feat_updated,
            hla_pocket,
            v_mask=batch['pep_mask'].to(self.device, non_blocking=True),
            q_mask=None
        )
        logits = self.head(fused_vec).squeeze(-1)

        probs = torch.sigmoid(logits).detach().cpu().numpy()

        binding_loss = self.loss_fn(logits, labels.float())

        return probs, binding_loss, attn.detach().cpu().numpy().sum(axis=1), fused_vec.detach().cpu().numpy()
|
| 1596 |
+
|
| 1597 |
+
# -------------------------- 编码器复用接口(给 TCR-HLA 模型用) --------------------------
|
| 1598 |
+
def _pad_peptide(self, x, max_len):
|
| 1599 |
+
"""Pad peptide feature tensor [1, L, D] to [1, max_len, D]."""
|
| 1600 |
+
B, L, D = x.shape
|
| 1601 |
+
if L < max_len:
|
| 1602 |
+
pad = x.new_zeros(B, max_len - L, D)
|
| 1603 |
+
return torch.cat([x, pad], dim=1)
|
| 1604 |
+
else:
|
| 1605 |
+
return x[:, :max_len, :]
|
| 1606 |
+
|
| 1607 |
+
    @torch.no_grad()
    def encode_peptide_hla(self, pep_id, pep_phys, pep_esm, hla_phys, hla_esm, hla_struct, hla_coord, max_pep_len):
        """Encoder-reuse interface (for the downstream TCR–HLA model).

        Encodes a single peptide/HLA pair without gradients and returns the
        cross-attended per-residue features for both sequences.

        Args:
            pep_id: peptide sequence; its length gives the number of valid tokens.
            pep_phys, pep_esm: [1, L, *] peptide feature tensors.
            hla_phys, hla_esm, hla_struct, hla_coord: HLA feature tensors.
            max_pep_len: fixed peptide length the features are padded/truncated to.

        Returns:
            (pep_feat_updated, hla_feat_updated): per-residue features after
            bidirectional cross-attention with residual + LayerNorm.
        """
        Lp = len(pep_id)

        pep_phys = self.proj_pep_phys(pep_phys)
        pep_esm = self.proj_pep_esm(pep_esm)

        # Align peptide features to a fixed length so downstream batches match.
        pep_phys = self._pad_peptide(pep_phys, max_pep_len)
        pep_esm = self._pad_peptide(pep_esm, max_pep_len)

        device = pep_phys.device
        # Validity mask: True for real residues, False for padding.
        pep_mask = torch.zeros(1, max_pep_len, dtype=torch.bool, device=device)
        pep_mask[0, :Lp] = True

        # 1) peptide encoding: gated fusion + positions + transformer encoder
        pep_feat = self.gate_pep(pep_phys, pep_esm)
        pep_feat = self.add_positional_encoding(pep_feat, self.pep_pos_embed)
        pep_feat = self.pep_encoder(pep_feat, src_key_padding_mask=~pep_mask)

        # 2) hla encoding
        hla_phys = self.proj_hla_phys(hla_phys)
        hla_esm = self.proj_hla_esm(hla_esm)
        hla_se3 = self.se3_model(hla_struct, hla_coord, None)[0]
        hla_se3 = self.proj_hla_se3(hla_se3)
        hla_feat = self.gate_hla(hla_phys, hla_esm, hla_se3)
        hla_feat = self.add_positional_encoding(hla_feat, self.hla_pos_embed)
        hla_feat = self.hla_encoder(hla_feat)

        # --- 3a. Peptide (Q) attends over HLA (K, V) ---
        pep_feat_cross, _ = self.cross_attn_pep_hla(
            query=pep_feat,
            key=hla_feat,
            value=hla_feat,
            key_padding_mask=None
        )
        pep_feat_updated = self.norm_cross_pep(pep_feat + pep_feat_cross)

        # --- 3b. HLA (Q) attends over Peptide (K, V) ---
        hla_feat_cross, _ = self.cross_attn_hla_pep(
            query=hla_feat,
            key=pep_feat,
            value=pep_feat,
            key_padding_mask=~pep_mask
        )
        hla_feat_updated = self.norm_cross_hla(hla_feat + hla_feat_cross)

        return pep_feat_updated, hla_feat_updated
|
| 1653 |
+
|
| 1654 |
+
class TCRPeptideHLABindingPredictor(nn.Module):
    """Predict TCR–peptide–HLA binding from per-residue features.

    Encodes TCR alpha/beta chains, the peptide and the HLA from
    physicochemical, ESM2 and structural (EGNN) inputs, couples them with
    cross-attention (CDR3 <- peptide, full TCR <- HLA) and four bilinear
    attention (BAN) interactions, then scores binding with a small MLP head.
    """

    def __init__(
        self,
        tcr_dim=256,
        pep_dim=256,
        hla_dim=256,
        bilinear_dim=256,
        loss_fn='bce',
        alpha=0.5,
        gamma=2.0,
        dropout=0.1,
        device='cuda:0',
        pos_weights=None
    ):
        """Build all sub-modules.

        Args:
            tcr_dim / pep_dim / hla_dim: hidden dims per sequence type.
            bilinear_dim: output dim of each BAN interaction.
            loss_fn: 'bce' (optionally class-weighted) or 'focal'.
            alpha, gamma: focal-loss hyperparameters (used when loss_fn='focal').
            dropout: dropout rate shared across attention/encoder layers.
            device: device string the batch tensors are moved to in forward().
            pos_weights: positive-class weight for BCEWithLogitsLoss, or None.

        Raises:
            ValueError: if ``loss_fn`` is not one of 'bce' / 'focal'.
        """
        super().__init__()

        # Maximum lengths sizing the learned TCR α/β / peptide / HLA
        # position-embedding tables.
        self.max_tcra_len = 500
        self.max_tcrb_len = 500
        self.max_pep_len = 20
        self.max_hla_len = 180
        self.alpha = alpha
        self.gamma = gamma
        self.dropout = dropout

        if loss_fn == 'bce':
            self.loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weights]) if pos_weights is not None else None)
        elif loss_fn == 'focal':
            self.loss_fn = FocalLoss(alpha=alpha, gamma=gamma)
        else:
            raise ValueError(f"Unknown loss function: {loss_fn}")

        # Learned absolute position embeddings, one table per sequence type.
        self.tcra_pos_embed = nn.Parameter(torch.randn(self.max_tcra_len, tcr_dim))
        self.tcrb_pos_embed = nn.Parameter(torch.randn(self.max_tcrb_len, tcr_dim))
        self.pep_pos_embed = nn.Parameter(torch.randn(self.max_pep_len, pep_dim))
        self.hla_pos_embed = nn.Parameter(torch.randn(self.max_hla_len, hla_dim))

        self.device = device
        self.tcr_dim = tcr_dim
        self.pep_dim = pep_dim
        self.hla_dim = hla_dim
        self.bilinear_dim = bilinear_dim

        d_model = tcr_dim
        n_heads = 8

        # Cross-attention modules: CDR3 queries over peptide, full TCR over HLA.
        self.cross_attn_tcra_pep = nn.MultiheadAttention(d_model, n_heads, dropout=self.dropout, batch_first=True)
        self.cross_attn_tcra_hla = nn.MultiheadAttention(d_model, n_heads, dropout=self.dropout, batch_first=True)
        self.cross_attn_tcrb_pep = nn.MultiheadAttention(d_model, n_heads, dropout=self.dropout, batch_first=True)
        self.cross_attn_tcrb_hla = nn.MultiheadAttention(d_model, n_heads, dropout=self.dropout, batch_first=True)
        self.norm_tcra_pep = nn.LayerNorm(d_model)
        self.norm_tcra_hla = nn.LayerNorm(d_model)
        self.norm_tcrb_pep = nn.LayerNorm(d_model)
        self.norm_tcrb_hla = nn.LayerNorm(d_model)

        # =======================
        # TCRα / TCRβ encoders
        # =======================
        def make_tcr_encoder():
            # One independent encoder stack per TCR chain:
            # projections (phys/ESM/structure) + EGNN + gated fusion + transformer.
            proj_phys = ResidueProjector(20, tcr_dim)
            proj_esm = ResidueProjector(1280, tcr_dim)
            proj_struct = ResidueProjector(17, tcr_dim)
            se3 = StackedEGNN(dim=17, layers=1)
            gate = ResidueTripleFusion(tcr_dim)
            encoder_layer = TransformerEncoderLayer(
                d_model=tcr_dim, nhead=8, dim_feedforward=tcr_dim*4, dropout=self.dropout, batch_first=True
            )
            encoder = TransformerEncoder(encoder_layer, num_layers=2)
            return nn.ModuleDict(dict(
                proj_phys=proj_phys, proj_esm=proj_esm, proj_struct=proj_struct,
                se3=se3, gate=gate, encoder=encoder
            ))

        self.tcra_enc = make_tcr_encoder()
        self.tcrb_enc = make_tcr_encoder()

        # =======================
        # Peptide encoder (phys + esm + structure)
        # =======================
        self.proj_pep_phys = ResidueProjector(20, pep_dim)
        self.proj_pep_esm = ResidueProjector(1280, pep_dim)
        self.proj_pep_struct = ResidueProjector(17, pep_dim)
        self.pep_se3 = StackedEGNN(dim=17, layers=1)
        self.pep_gate = ResidueTripleFusion(pep_dim)
        pep_encoder_layer = TransformerEncoderLayer(
            d_model=pep_dim, nhead=8, dim_feedforward=pep_dim*4, dropout=self.dropout, batch_first=True
        )
        self.pep_encoder = TransformerEncoder(pep_encoder_layer, num_layers=2)

        # =======================
        # HLA encoder
        # =======================
        self.proj_hla_phys = ResidueProjector(20, hla_dim)
        self.proj_hla_esm = ResidueProjector(1280, hla_dim)
        self.proj_hla_struct = ResidueProjector(17, hla_dim)
        self.hla_se3 = StackedEGNN(dim=17, layers=1)
        self.hla_gate = ResidueTripleFusion(hla_dim)
        hla_encoder_layer = TransformerEncoderLayer(
            d_model=hla_dim, nhead=8, dim_feedforward=hla_dim*4, dropout=self.dropout, batch_first=True
        )
        self.hla_encoder = TransformerEncoder(hla_encoder_layer, num_layers=1)

        # Second-stage gates to blend in pretrained pHLA features (see forward()).
        self.pep_gate_2 = ResidueDoubleFusion(pep_dim)
        self.hla_gate_2 = ResidueDoubleFusion(hla_dim)

        # =======================
        # Bilinear interactions
        # =======================
        self.bi_tcra_pep = BANLayer(tcr_dim, pep_dim, bilinear_dim, h_out=4, k=3)
        self.bi_tcrb_pep = BANLayer(tcr_dim, pep_dim, bilinear_dim, h_out=4, k=3)
        self.bi_tcra_hla = BANLayer(tcr_dim, hla_dim, bilinear_dim, h_out=4, k=3)
        self.bi_tcrb_hla = BANLayer(tcr_dim, hla_dim, bilinear_dim, h_out=4, k=3)

        # =======================
        # Head
        # =======================
        # Four BAN vectors are concatenated before the MLP head.
        total_fused_dim = bilinear_dim * 4
        self.head = nn.Sequential(
            nn.Linear(total_fused_dim, bilinear_dim),
            nn.ReLU(),
            nn.Linear(bilinear_dim, 1)
        )

    def encode_tcr(self, x_phys, x_esm, x_struct, x_coord, x_mask, enc, pos_embed):
        """Encode one TCR chain with the given encoder stack.

        Args:
            x_phys, x_esm, x_struct, x_coord: per-residue feature tensors.
            x_mask: [B, L] boolean validity mask (True = real residue).
            enc: ModuleDict built by ``make_tcr_encoder``.
            pos_embed: the chain-specific position-embedding table.

        Returns:
            [B, L, D] encoded chain features.
        """
        phys = enc['proj_phys'](x_phys)
        esm = enc['proj_esm'](x_esm)
        se3 = enc['se3'](x_struct, x_coord, None)[0]
        se3 = enc['proj_struct'](se3)
        feat = enc['gate'](phys, esm, se3)
        feat = self.add_positional_encoding(feat, pos_embed)
        # Transformer expects True at pad positions, hence the inversion.
        feat = enc['encoder'](feat, src_key_padding_mask=~x_mask)
        return feat

    def add_positional_encoding(self, x, pos_embed):
        """Add learned positional encodings.

        Args:
            x: [B, L, D] token features.
            pos_embed: [L_max, D] position table; the first L rows are used.
        """
        B, L, D = x.shape
        pe = pos_embed[:L, :].unsqueeze(0).expand(B, -1, -1)
        return x + pe

    def _extract_cdr3_segment(self, tcr_feat, cdr3_start, cdr3_end):
        """Extract CDR3 embeddings and the matching validity mask.

        Args:
            tcr_feat: [B, L, D] full-chain TCR features.
            cdr3_start, cdr3_end: [B] per-sample segment bounds
                (half-open: length = end - start).

        Returns:
            out: [B, max_len, D] CDR3 features, zeroed at invalid positions.
            mask: [B, max_len] boolean mask (True = valid).
        """
        B, L, D = tcr_feat.shape
        device = tcr_feat.device

        # Per-sample CDR3 length; clamp guards against end < start.
        lens = (cdr3_end - cdr3_start).clamp(min=0)
        max_len = lens.max().item()

        rel_idx = torch.arange(max_len, device=device).unsqueeze(0).expand(B, -1)  # [B, max_len]
        abs_idx = cdr3_start.unsqueeze(1) + rel_idx  # [B, max_len]

        # True marks valid positions; "<" suffices for half-open bounds.
        mask = rel_idx < lens.unsqueeze(1)

        # Out-of-range indices are redirected to 0 (any valid index works,
        # since those positions are masked out afterwards).
        abs_idx = torch.where(mask, abs_idx, torch.zeros_like(abs_idx))

        # gather
        gather_idx = abs_idx.unsqueeze(-1).expand(-1, -1, D)
        out = torch.gather(tcr_feat, 1, gather_idx)

        # Zero masked positions so invalid tokens cannot contribute downstream.
        out = out * mask.unsqueeze(-1)

        return out, mask

    def forward(self, batch):
        """Score TCR–peptide–HLA binding for one batch.

        Args:
            batch: dict with per-chain feature tensors, masks, CDR3 bounds,
                labels, and optionally pretrained pHLA features
                ('pep_feat_pretrain' / 'hla_feat_pretrain').

        Returns:
            probs: sigmoid binding probabilities (tensor).
            loss_binding: loss of logits vs batch['label'].
            pep_feat: numpy copy of the final peptide features.
            attn_dict: numpy head-summed BAN attention maps, one per interaction.
        """
        # TCRα / TCRβ full-chain encodings
        tcra_feat = self.encode_tcr(
            batch['tcra_phys'].to(self.device, non_blocking=True),
            batch['tcra_esm'].to(self.device, non_blocking=True),
            batch['tcra_struct'].to(self.device, non_blocking=True),
            batch['tcra_coord'].to(self.device, non_blocking=True),
            batch['tcra_mask'].to(self.device, non_blocking=True),
            self.tcra_enc,
            self.tcra_pos_embed
        )
        tcrb_feat = self.encode_tcr(
            batch['tcrb_phys'].to(self.device, non_blocking=True),
            batch['tcrb_esm'].to(self.device, non_blocking=True),
            batch['tcrb_struct'].to(self.device, non_blocking=True),
            batch['tcrb_coord'].to(self.device, non_blocking=True),
            batch['tcrb_mask'].to(self.device, non_blocking=True),
            self.tcrb_enc,
            self.tcrb_pos_embed
        )
        # peptide
        pep_phys = self.proj_pep_phys(batch['pep_phys'].to(self.device, non_blocking=True))
        pep_esm = self.proj_pep_esm(batch['pep_esm'].to(self.device, non_blocking=True))
        pep_se3 = self.pep_se3(batch['pep_struct'].to(self.device, non_blocking=True), batch['pep_coord'].to(self.device, non_blocking=True), None)[0]
        pep_se3 = self.proj_pep_struct(pep_se3)
        pep_feat = self.pep_gate(pep_phys, pep_esm, pep_se3)
        pep_feat = self.add_positional_encoding(pep_feat, self.pep_pos_embed)
        pep_feat = self.pep_encoder(
            pep_feat,
            src_key_padding_mask=~batch['pep_mask'].to(self.device)
        )
        # hla
        hla_phys = self.proj_hla_phys(batch['hla_phys'].to(self.device, non_blocking=True))
        hla_esm = self.proj_hla_esm(batch['hla_esm'].to(self.device, non_blocking=True))
        hla_se3 = self.hla_se3(batch['hla_struct'].to(self.device, non_blocking=True), batch['hla_coord'].to(self.device, non_blocking=True), None)[0]
        hla_se3 = self.proj_hla_struct(hla_se3)
        hla_feat = self.hla_gate(hla_phys, hla_esm, hla_se3)
        hla_feat = self.add_positional_encoding(hla_feat, self.hla_pos_embed)
        hla_feat = self.hla_encoder(hla_feat)

        if ('pep_feat_pretrain' in batch) and ('hla_feat_pretrain' in batch):
            pep_pretrain = batch['pep_feat_pretrain'].to(self.device, non_blocking=True)
            hla_pretrain = batch['hla_feat_pretrain'].to(self.device, non_blocking=True)

            # ---- robust length alignment (crop both to the shorter length) ----
            Lp = pep_feat.shape[1]
            Lp_pretrain = pep_pretrain.shape[1]
            if Lp != Lp_pretrain:
                Lp_min = min(Lp, Lp_pretrain)
                pep_feat = pep_feat[:, :Lp_min, :]
                pep_pretrain = pep_pretrain[:, :Lp_min, :]

            Lh = hla_feat.shape[1]
            Lh_pretrain = hla_pretrain.shape[1]
            if Lh != Lh_pretrain:
                Lh_min = min(Lh, Lh_pretrain)
                hla_feat = hla_feat[:, :Lh_min, :]
                hla_pretrain = hla_pretrain[:, :Lh_min, :]

            # ---- Peptide gating with pretrained features ----
            pep_feat = self.pep_gate_2(pep_feat, pep_pretrain)
            # ---- HLA gating with pretrained features ----
            hla_feat = self.hla_gate_2(hla_feat, hla_pretrain)

        # TCRα CDR3 segment
        tcra_cdr3, cdr3a_mask = self._extract_cdr3_segment(
            tcra_feat,
            batch['cdr3a_start'].to(self.device, non_blocking=True),
            batch['cdr3a_end'].to(self.device, non_blocking=True)
        )

        # TCRβ CDR3 segment
        tcrb_cdr3, cdr3b_mask = self._extract_cdr3_segment(
            tcrb_feat,
            batch['cdr3b_start'].to(self.device, non_blocking=True),
            batch['cdr3b_end'].to(self.device, non_blocking=True)
        )

        # TCRα CDR3 ← Peptide
        tcra_cdr3_cross, _ = self.cross_attn_tcra_pep(
            query=tcra_cdr3,  # [B, La_cdr3, D]
            key=pep_feat, value=pep_feat,  # [B, Lp, D]
            key_padding_mask=~batch['pep_mask'].to(self.device)
        )
        tcra_cdr3 = self.norm_tcra_pep(tcra_cdr3 + tcra_cdr3_cross)
        # Re-mask padded CDR3 positions so invalid tokens do not leak through.
        tcra_cdr3 = tcra_cdr3 * cdr3a_mask.unsqueeze(-1)

        # TCRβ CDR3 ← Peptide
        tcrb_cdr3_cross, _ = self.cross_attn_tcrb_pep(
            query=tcrb_cdr3,
            key=pep_feat, value=pep_feat,
            key_padding_mask=~batch['pep_mask'].to(self.device)
        )
        tcrb_cdr3 = self.norm_tcrb_pep(tcrb_cdr3 + tcrb_cdr3_cross)
        tcrb_cdr3 = tcrb_cdr3 * cdr3b_mask.unsqueeze(-1)

        # ------------------ Cross-attn: full TCR sequences ↔ HLA ------------------
        # TCRα full ← HLA
        tcra_hla_cross, _ = self.cross_attn_tcra_hla(
            query=tcra_feat,  # [B, La, D]
            key=hla_feat, value=hla_feat,  # [B, Lh, D]
            key_padding_mask=None
        )
        tcra_feat = self.norm_tcra_hla(tcra_feat + tcra_hla_cross)
        tcra_feat = tcra_feat * batch['tcra_mask'].to(self.device).unsqueeze(-1)

        # TCRβ full ← HLA
        tcrb_hla_cross, _ = self.cross_attn_tcrb_hla(
            query=tcrb_feat,
            key=hla_feat, value=hla_feat,
            key_padding_mask=None
        )
        tcrb_feat = self.norm_tcrb_hla(tcrb_feat + tcrb_hla_cross)
        tcrb_feat = tcrb_feat * batch['tcrb_mask'].to(self.device).unsqueeze(-1)

        # bilinear fusion of the four interaction pairs
        vec_tcra_pep, attn_tcra_pep = self.bi_tcra_pep(tcra_cdr3, pep_feat, v_mask=cdr3a_mask, q_mask=batch['pep_mask'].to(self.device))
        vec_tcrb_pep, attn_tcrb_pep = self.bi_tcrb_pep(tcrb_cdr3, pep_feat, v_mask=cdr3b_mask, q_mask=batch['pep_mask'].to(self.device))
        vec_tcra_hla, attn_tcra_hla = self.bi_tcra_hla(tcra_feat, hla_feat, v_mask=batch['tcra_mask'].to(self.device), q_mask=None)
        vec_tcrb_hla, attn_tcrb_hla = self.bi_tcrb_hla(tcrb_feat, hla_feat, v_mask=batch['tcrb_mask'].to(self.device), q_mask=None)

        # Sum attention over heads for compact inspection/export.
        attn_tcra_pep_small = attn_tcra_pep.sum(dim=1).float()
        attn_tcrb_pep_small = attn_tcrb_pep.sum(dim=1).float()
        attn_tcra_hla_small = attn_tcra_hla.sum(dim=1).float()
        attn_tcrb_hla_small = attn_tcrb_hla.sum(dim=1).float()

        attn_dict = {
            'attn_tcra_pep': attn_tcra_pep_small.detach().cpu().numpy(),
            'attn_tcrb_pep': attn_tcrb_pep_small.detach().cpu().numpy(),
            'attn_tcra_hla': attn_tcra_hla_small.detach().cpu().numpy(),
            'attn_tcrb_hla': attn_tcrb_hla_small.detach().cpu().numpy()
        }

        fused = torch.cat([vec_tcra_pep, vec_tcrb_pep, vec_tcra_hla, vec_tcrb_hla], dim=-1)
        logits = self.head(fused).squeeze(-1)

        labels = batch['label'].to(self.device)
        loss_binding = self.loss_fn(logits, labels.float())

        probs = torch.sigmoid(logits)

        return probs, loss_binding, pep_feat.detach().cpu().numpy(), attn_dict
|
src/phla_cache/hla_coord_dict.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad3f796d0193cb85fa7786581064df26bf50bc75a362fd4687b711af5d65738a
|
| 3 |
+
size 381645
|
src/phla_cache/hla_feat_dict.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:08abc4aff2c1afcf26829a14f9e44081f70b70e6aad52f69be39b0d055e6fd87
|
| 3 |
+
size 1878067
|
src/physicochemical.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from typing import List, Optional
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sklearn.preprocessing import StandardScaler
|
| 6 |
+
|
| 7 |
+
# class PhysicochemicalEncoder(nn.Module):
|
| 8 |
+
# """Amino Acid Physicochemical Property Encoder (AAindex版本)"""
|
| 9 |
+
|
| 10 |
+
# def __init__(self, device, use_aaindex=True, selected_features=None):
|
| 11 |
+
# """
|
| 12 |
+
# Args:
|
| 13 |
+
# device: torch device
|
| 14 |
+
# use_aaindex: 是否使用AAindex特征(True)还是简单的5特征(False)
|
| 15 |
+
# selected_features: 选择使用哪些AAindex特征(None=使用全部)
|
| 16 |
+
# """
|
| 17 |
+
# super().__init__()
|
| 18 |
+
# self.device = device
|
| 19 |
+
# self.use_aaindex = use_aaindex
|
| 20 |
+
|
| 21 |
+
# if use_aaindex:
|
| 22 |
+
# # 从AAindex加载特征
|
| 23 |
+
# self.aa_properties, self.feature_names = self._load_aaindex_features(selected_features)
|
| 24 |
+
# self.n_features = len(list(self.aa_properties['A'].values()))
|
| 25 |
+
# print(f"✓ Loaded {self.n_features} AAindex features")
|
| 26 |
+
# else:
|
| 27 |
+
# # 使用简单的5特征
|
| 28 |
+
# self.aa_properties = self._get_basic_properties()
|
| 29 |
+
# self.n_features = 5
|
| 30 |
+
# print(f"✓ Using {self.n_features} basic features")
|
| 31 |
+
|
| 32 |
+
# # 标准化(重要!不同特征范围差异大)
|
| 33 |
+
# self.scaler = self._fit_scaler()
|
| 34 |
+
|
| 35 |
+
# def _load_aaindex_features(self, selected_features=None):
|
| 36 |
+
# """从AAindex加载特征"""
|
| 37 |
+
# try:
|
| 38 |
+
# # 尝试导入生成的文件
|
| 39 |
+
# from aa_properties_aaindex import AA_PROPERTIES_AAINDEX, FEATURE_DESCRIPTIONS
|
| 40 |
+
|
| 41 |
+
# if selected_features is not None:
|
| 42 |
+
# # 只选择指定的特征
|
| 43 |
+
# filtered_props = {}
|
| 44 |
+
# for aa, props in AA_PROPERTIES_AAINDEX.items():
|
| 45 |
+
# filtered_props[aa] = {k: v for k, v in props.items()
|
| 46 |
+
# if k in selected_features}
|
| 47 |
+
# return filtered_props, selected_features
|
| 48 |
+
# else:
|
| 49 |
+
# # 使用所有特征
|
| 50 |
+
# feature_names = list(AA_PROPERTIES_AAINDEX['A'].keys())
|
| 51 |
+
# return AA_PROPERTIES_AAINDEX, feature_names
|
| 52 |
+
|
| 53 |
+
# except ImportError:
|
| 54 |
+
# print("⚠ Warning: aa_properties_aaindex.py not found!")
|
| 55 |
+
# print(" Falling back to basic 5 features")
|
| 56 |
+
# print(" Run 'python aaindex_downloader.py' to download AAindex features")
|
| 57 |
+
# return self._get_basic_properties(), ['hydro', 'charge', 'volume', 'flex', 'aroma']
|
| 58 |
+
|
| 59 |
+
# def _get_basic_properties(self):
|
| 60 |
+
# """基础的5特征(作为fallback)"""
|
| 61 |
+
# return {
|
| 62 |
+
# 'A': [1.8, 0.0, 88.6, 0.36, 0.0],
|
| 63 |
+
# 'C': [2.5, 0.0, 108.5, 0.35, 0.0],
|
| 64 |
+
# 'D': [-3.5, -1.0, 111.1, 0.51, 0.0],
|
| 65 |
+
# 'E': [-3.5, -1.0, 138.4, 0.50, 0.0],
|
| 66 |
+
# 'F': [2.8, 0.0, 189.9, 0.31, 1.0],
|
| 67 |
+
# 'G': [-0.4, 0.0, 60.1, 0.54, 0.0],
|
| 68 |
+
# 'H': [-3.2, 0.5, 153.2, 0.32, 0.5],
|
| 69 |
+
# 'I': [4.5, 0.0, 166.7, 0.46, 0.0],
|
| 70 |
+
# 'K': [-3.9, 1.0, 168.6, 0.47, 0.0],
|
| 71 |
+
# 'L': [3.8, 0.0, 166.7, 0.37, 0.0],
|
| 72 |
+
# 'M': [1.9, 0.0, 162.9, 0.30, 0.0],
|
| 73 |
+
# 'N': [-3.5, 0.0, 114.1, 0.46, 0.0],
|
| 74 |
+
# 'P': [-1.6, 0.0, 112.7, 0.51, 0.0],
|
| 75 |
+
# 'Q': [-3.5, 0.0, 143.8, 0.49, 0.0],
|
| 76 |
+
# 'R': [-4.5, 1.0, 173.4, 0.53, 0.0],
|
| 77 |
+
# 'S': [-0.8, 0.0, 89.0, 0.51, 0.0],
|
| 78 |
+
# 'T': [-0.7, 0.0, 116.1, 0.44, 0.0],
|
| 79 |
+
# 'V': [4.2, 0.0, 140.0, 0.39, 0.0],
|
| 80 |
+
# 'W': [-0.9, 0.0, 227.8, 0.31, 1.0],
|
| 81 |
+
# 'Y': [-1.3, 0.0, 193.6, 0.42, 1.0],
|
| 82 |
+
# 'X': [0.0, 0.0, 120.0, 0.40, 0.0],
|
| 83 |
+
# }
|
| 84 |
+
|
| 85 |
+
# def _fit_scaler(self):
|
| 86 |
+
# """拟合标准化器"""
|
| 87 |
+
# # 收集所有氨基酸的特征
|
| 88 |
+
# all_features = []
|
| 89 |
+
# for aa in 'ARNDCQEGHILKMFPSTWYV': # 20种标准氨基酸
|
| 90 |
+
# if isinstance(self.aa_properties[aa], dict):
|
| 91 |
+
# # AAindex格式
|
| 92 |
+
# features = list(self.aa_properties[aa].values())
|
| 93 |
+
# else:
|
| 94 |
+
# # 列表格式
|
| 95 |
+
# features = self.aa_properties[aa]
|
| 96 |
+
# all_features.append(features)
|
| 97 |
+
|
| 98 |
+
# all_features = np.array(all_features)
|
| 99 |
+
|
| 100 |
+
# # Z-score标准化
|
| 101 |
+
# scaler = StandardScaler()
|
| 102 |
+
# scaler.fit(all_features)
|
| 103 |
+
|
| 104 |
+
# return scaler
|
| 105 |
+
|
| 106 |
+
# def _get_aa_features(self, aa: str) -> List[float]:
|
| 107 |
+
# """获取单个氨基酸的特征"""
|
| 108 |
+
# aa = aa.upper()
|
| 109 |
+
# if aa not in self.aa_properties:
|
| 110 |
+
# aa = 'X' # Unknown
|
| 111 |
+
|
| 112 |
+
# if isinstance(self.aa_properties[aa], dict):
|
| 113 |
+
# # AAindex格式:字典
|
| 114 |
+
# features = list(self.aa_properties[aa].values())
|
| 115 |
+
# else:
|
| 116 |
+
# # 基础格式:列表
|
| 117 |
+
# features = self.aa_properties[aa]
|
| 118 |
+
|
| 119 |
+
# return features
|
| 120 |
+
|
| 121 |
+
# def forward(self, sequences: List[str]) -> torch.Tensor:
|
| 122 |
+
# """
|
| 123 |
+
# Args:
|
| 124 |
+
# sequences: List of amino acid sequences
|
| 125 |
+
# Returns:
|
| 126 |
+
# [B, max_len, n_features] 标准化后的特征
|
| 127 |
+
# """
|
| 128 |
+
# batch_size = len(sequences)
|
| 129 |
+
# max_len = max(len(seq) for seq in sequences)
|
| 130 |
+
|
| 131 |
+
# # 收集特征
|
| 132 |
+
# properties = []
|
| 133 |
+
# for seq in sequences:
|
| 134 |
+
# seq_props = []
|
| 135 |
+
# for aa in seq:
|
| 136 |
+
# props = self._get_aa_features(aa)
|
| 137 |
+
# seq_props.append(props)
|
| 138 |
+
|
| 139 |
+
# # Padding
|
| 140 |
+
# while len(seq_props) < max_len:
|
| 141 |
+
# seq_props.append([0.0] * self.n_features)
|
| 142 |
+
|
| 143 |
+
# properties.append(seq_props)
|
| 144 |
+
|
| 145 |
+
# properties = np.array(properties) # [B, L, n_features]
|
| 146 |
+
|
| 147 |
+
# # 标准化(除了padding位置)
|
| 148 |
+
# batch_size, seq_len, n_feat = properties.shape
|
| 149 |
+
# properties_flat = properties.reshape(-1, n_feat)
|
| 150 |
+
|
| 151 |
+
# # 标准化
|
| 152 |
+
# properties_normalized = self.scaler.transform(properties_flat)
|
| 153 |
+
# properties_normalized = properties_normalized.reshape(batch_size, seq_len, n_feat)
|
| 154 |
+
|
| 155 |
+
# # 转为tensor
|
| 156 |
+
# properties_tensor = torch.tensor(
|
| 157 |
+
# properties_normalized,
|
| 158 |
+
# dtype=torch.float32,
|
| 159 |
+
# device=self.device
|
| 160 |
+
# )
|
| 161 |
+
|
| 162 |
+
# return properties_tensor # [B, L, n_features]
|
| 163 |
+
|
| 164 |
+
import torch
|
| 165 |
+
import torch.nn as nn
|
| 166 |
+
import numpy as np
|
| 167 |
+
from sklearn.preprocessing import StandardScaler
|
| 168 |
+
from typing import List
|
| 169 |
+
|
| 170 |
+
class PhysicochemicalEncoder(nn.Module):
    """Amino acid physicochemical property encoder (AAindex version, vectorized).

    Builds a per-amino-acid feature lookup table and standardization
    parameters once at construction time, so ``forward`` reduces to an
    index lookup plus an affine normalization on the target device.
    """

    def __init__(self, device, use_aaindex=True, selected_features=None):
        """
        Args:
            device: torch device that the lookup table and normalization
                parameters are placed on.
            use_aaindex: if True, load AAindex features from the
                ``aa_properties_aaindex`` module; otherwise use 5 basic
                physicochemical properties.
            selected_features: optional list of AAindex feature names to keep.
        """
        super().__init__()
        self.device = device
        self.use_aaindex = use_aaindex

        # Load per-amino-acid property tables.
        if use_aaindex:
            self.aa_properties, self.feature_names = self._load_aaindex_features(selected_features)
            # BUGFIX: the ImportError fallback of _load_aaindex_features returns
            # list-valued properties, so `self.aa_properties['A'].values()` would
            # raise AttributeError. _get_aa_features handles both dict- and
            # list-valued entries.
            self.n_features = len(self._get_aa_features('A'))
            print(f"✓ Loaded {self.n_features} AAindex features")
        else:
            self.aa_properties = self._get_basic_properties()
            self.n_features = 5
            print(f"✓ Using {self.n_features} basic features")

        # Fit the standardizer on the 20 canonical amino acids.
        self.scaler = self._fit_scaler()

        # ======================== Precomputation ======================== #
        # 1. Build the lookup table: stable (sorted) amino-acid order, with
        #    the last row reserved as the padding vector.
        aa_list = sorted(self.aa_properties.keys())
        self.aa_to_idx = {aa: i for i, aa in enumerate(aa_list)}
        self.pad_idx = len(self.aa_to_idx)  # padding index

        aa_feature_table = [self._get_aa_features(aa) for aa in aa_list]
        aa_feature_table.append([0.0] * self.n_features)  # padding vector
        # NOTE(review): the padding row is zero *before* standardization, so
        # after normalization padded positions sit at (0 - mean) / scale rather
        # than 0 — confirm downstream masking accounts for this.
        self.aa_feature_table = torch.tensor(
            np.array(aa_feature_table),
            dtype=torch.float32
        ).to(self.device)  # [n_aa + 1, n_feat]

        # 2. Keep standardization parameters as tensors on the target device
        #    so forward() never leaves the device.
        self.mean_tensor = torch.tensor(self.scaler.mean_, dtype=torch.float32, device=self.device)
        self.scale_tensor = torch.tensor(self.scaler.scale_, dtype=torch.float32, device=self.device)

    def _load_aaindex_features(self, selected_features=None):
        """Load AAindex property tables, optionally filtered to a feature subset.

        Returns:
            (properties, feature_names) where ``properties`` maps amino acid
            letter -> dict of feature name -> value. Falls back to the five
            basic (list-valued) properties if the AAindex module is missing.
        """
        try:
            from aa_properties_aaindex import AA_PROPERTIES_AAINDEX, FEATURE_DESCRIPTIONS
            if selected_features is not None:
                filtered_props = {}
                for aa, props in AA_PROPERTIES_AAINDEX.items():
                    filtered_props[aa] = {k: v for k, v in props.items() if k in selected_features}
                return filtered_props, selected_features
            else:
                feature_names = list(AA_PROPERTIES_AAINDEX['A'].keys())
                return AA_PROPERTIES_AAINDEX, feature_names
        except ImportError:
            print("⚠ Warning: aa_properties_aaindex.py not found!")
            return self._get_basic_properties(), ['hydro', 'charge', 'volume', 'flex', 'aroma']

    def _get_basic_properties(self):
        """Return 5 basic properties per amino acid.

        Order: [hydropathy, charge, volume, flexibility, aromaticity].
        'X' is the unknown-residue placeholder.
        """
        return {
            'A': [1.8, 0.0, 88.6, 0.36, 0.0],
            'C': [2.5, 0.0, 108.5, 0.35, 0.0],
            'D': [-3.5, -1.0, 111.1, 0.51, 0.0],
            'E': [-3.5, -1.0, 138.4, 0.50, 0.0],
            'F': [2.8, 0.0, 189.9, 0.31, 1.0],
            'G': [-0.4, 0.0, 60.1, 0.54, 0.0],
            'H': [-3.2, 0.5, 153.2, 0.32, 0.5],
            'I': [4.5, 0.0, 166.7, 0.46, 0.0],
            'K': [-3.9, 1.0, 168.6, 0.47, 0.0],
            'L': [3.8, 0.0, 166.7, 0.37, 0.0],
            'M': [1.9, 0.0, 162.9, 0.30, 0.0],
            'N': [-3.5, 0.0, 114.1, 0.46, 0.0],
            'P': [-1.6, 0.0, 112.7, 0.51, 0.0],
            'Q': [-3.5, 0.0, 143.8, 0.49, 0.0],
            'R': [-4.5, 1.0, 173.4, 0.53, 0.0],
            'S': [-0.8, 0.0, 89.0, 0.51, 0.0],
            'T': [-0.7, 0.0, 116.1, 0.44, 0.0],
            'V': [4.2, 0.0, 140.0, 0.39, 0.0],
            'W': [-0.9, 0.0, 227.8, 0.31, 1.0],
            'Y': [-1.3, 0.0, 193.6, 0.42, 1.0],
            'X': [0.0, 0.0, 120.0, 0.40, 0.0],
        }

    def _fit_scaler(self):
        """Fit a StandardScaler on the 20 canonical amino acids' features.

        Note: fits on the canonical residues only, i.e. the 'X' placeholder
        and the padding vector do not influence mean/scale.
        """
        # _get_aa_features handles both dict- and list-valued property tables,
        # keeping this consistent with the lookup-table construction.
        all_features = np.array([self._get_aa_features(aa) for aa in 'ARNDCQEGHILKMFPSTWYV'])
        scaler = StandardScaler()
        scaler.fit(all_features)
        return scaler

    def _get_aa_features(self, aa: str) -> list:
        """Return the raw feature vector for one residue.

        Unknown residues (and lowercase input) are normalized: lowercase is
        upper-cased, anything not in the table maps to the 'X' placeholder.
        """
        aa = aa.upper()
        if aa not in self.aa_properties:
            aa = 'X'
        entry = self.aa_properties[aa]
        # Properties may be dict-valued (AAindex) or list-valued (basic/fallback).
        return list(entry.values()) if isinstance(entry, dict) else entry

    def forward(self, sequences: List[str]) -> torch.Tensor:
        """Encode a batch of amino-acid sequences.

        Args:
            sequences: list of amino-acid strings (variable lengths).

        Returns:
            [B, max_len, n_features] float32 tensor of standardized features;
            positions past a sequence's length hold the (normalized) padding row.
        """
        batch_size = len(sequences)
        max_len = max(len(seq) for seq in sequences)

        # 1) Encode sequences to row indices into the lookup table, padding
        #    short sequences with pad_idx.
        # NOTE(review): unknown residues map to pad_idx here, whereas
        # _get_aa_features maps them to 'X' — confirm this asymmetry is intended.
        idx_batch = np.full((batch_size, max_len), self.pad_idx, dtype=np.int64)
        for i, seq in enumerate(sequences):
            idx_seq = [self.aa_to_idx.get(aa.upper(), self.pad_idx) for aa in seq]
            idx_batch[i, :len(idx_seq)] = idx_seq

        idx_tensor = torch.tensor(idx_batch, dtype=torch.long, device=self.device)  # [B, L]

        # 2) Table lookup, then standardize with the precomputed statistics.
        props = self.aa_feature_table[idx_tensor]  # [B, L, n_feat]
        props = (props - self.mean_tensor) / self.scale_tensor

        return props
|
src/predictor.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from main import StriMap_pHLA, StriMap_TCRpHLA, load_test_data
|
| 3 |
+
|
| 4 |
+
def load_model(model_path="model.pt", device=None):
    """Build a StriMap_pHLA model and restore its weights from disk.

    Args:
        model_path: path to the serialized checkpoint.
        device: torch device spec forwarded to the model (may be None).

    Returns:
        (model, device) tuple; ``device`` is echoed back unchanged.
    """
    model = StriMap_pHLA(device=device, model_save_path=model_path, cache_save=False)
    model.load_model(model_path)
    return model, device
|
| 12 |
+
|
| 13 |
+
def predict_from_df(df, model):
    """Run peptide–HLA binding prediction over a dataframe.

    Args:
        df: dataframe with at least ``Peptide`` and ``HLA`` columns.
        model: a loaded StriMap_pHLA instance.

    Returns:
        The preprocessed dataframe with an added ``Prediction`` column.
    """
    prepared = load_test_data(df_test=df, hla_dict_path='HLA_dict.npy')
    model.prepare_embeddings(prepared, force_recompute=False)

    # The predict API expects a label column; supply a dummy value and
    # strip it again before returning.
    prepared['label'] = 1
    torch.cuda.empty_cache()
    probs, _ = model.predict(prepared, batch_size=128, return_probs=True, use_kfold=False)
    prepared["Prediction"] = probs
    return prepared.drop(columns=['label'])
|
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,59 @@
|
|
| 1 |
-
import altair as alt
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
import streamlit as st
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
import pandas as pd
from io import StringIO
from predictor import load_model, predict_from_df
from Bio import SeqIO
import torch

st.set_page_config(page_title="🧬 Peptide–HLA Binding Predictor", layout="wide")

st.title("🧠 Peptide–HLA Binding Predictor")
st.markdown("""
Upload a **CSV** file with columns `Peptide` and `HLA`,
or a **FASTA** file containing peptide sequences (headers optionally include HLA type).
""")

uploaded_file = st.file_uploader("Upload CSV or FASTA", type=["csv", "fasta"])

# Load the model once and cache it across Streamlit reruns.
@st.cache_resource
def get_model():
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model, device = load_model("model.pt", device=device)
    return model, device

model, device = get_model()

if uploaded_file:
    if uploaded_file.name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        # BUGFIX: Streamlit's UploadedFile is a binary buffer, but
        # Bio.SeqIO.parse requires a text handle for FASTA — decode first.
        fasta_handle = StringIO(uploaded_file.getvalue().decode("utf-8"))
        seqs = []
        for rec in SeqIO.parse(fasta_handle, "fasta"):
            header = rec.id
            seq = str(rec.seq)
            # Try to extract the HLA allele from the header,
            # e.g. ">HLA-A*02:01|SLLMWITQC".
            if "|" in header:
                hla, _ = header.split("|", 1)
            else:
                hla = "HLA-Unknown"
            seqs.append([seq, hla])
        df = pd.DataFrame(seqs, columns=["Peptide", "HLA"])

    st.write("✅ Uploaded data preview:")
    st.dataframe(df.head())

    if st.button("🚀 Run Prediction"):
        with st.spinner("Running model inference..."):
            result_df = predict_from_df(df, model)

        st.success("✅ Prediction complete!")
        st.dataframe(result_df.head(10))

        csv = result_df.to_csv(index=False).encode("utf-8")
        st.download_button(
            "⬇️ Download results as CSV",
            data=csv,
            file_name="hla_binding_predictions.csv",
            mime="text/csv",
        )
|
src/streamlit_app0.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

# Parametric spiral: the radius grows linearly with the normalized index
# while the angle winds num_turns full turns.
indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

x = np.cos(theta) * radius
y = np.sin(theta) * radius

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

# Color encodes progress along the spiral; point size is random jitter.
chart = (
    alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    )
)
st.altair_chart(chart)
|