cao committed on
Commit
78f28d5
·
1 Parent(s): d5362a7

Add model and predictor files

Browse files
src/HLA_dict.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2cee1e02e5548817e06e60b258f57dd27a3707dff357656cd956cab81adf2e6
3
+ size 3287055
src/aa_properties_aaindex.py ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Amino Acid Properties from AAindex Database
3
+ Auto-generated by AAindexDownloader
4
+
5
+ Total features: 20
6
+ """
7
+
8
+ import numpy as np
9
+
10
+ # Raw values from AAindex
11
+ AA_PROPERTIES_AAINDEX = {
12
+ 'A': {
13
+ 'BIGC670101': 52.600000,
14
+ 'CHAM820101': 0.046000,
15
+ 'CHOP780201': 1.420000,
16
+ 'CHOP780202': 0.830000,
17
+ 'CHOP780203': 0.740000,
18
+ 'EISD860101': 0.670000,
19
+ 'FASG760101': 89.090000,
20
+ 'FAUJ830101': 0.310000,
21
+ 'GRAR740102': 8.100000,
22
+ 'GRAR740103': 31.000000,
23
+ 'GUYH850101': 0.100000,
24
+ 'HOPT810101': -0.500000,
25
+ 'JANJ780101': 27.800000,
26
+ 'KARP850101': 1.041000,
27
+ 'KYTJ820101': 1.800000,
28
+ 'ROSM880101': -0.670000,
29
+ 'VINM940101': 0.984000,
30
+ 'WERD780101': 0.520000,
31
+ 'ZIMJ680101': 0.830000,
32
+ 'ZIMJ680104': 6.000000,
33
+ },
34
+ 'R': {
35
+ 'BIGC670101': 109.100000,
36
+ 'CHAM820101': 0.291000,
37
+ 'CHOP780201': 0.980000,
38
+ 'CHOP780202': 0.930000,
39
+ 'CHOP780203': 1.010000,
40
+ 'EISD860101': -2.100000,
41
+ 'FASG760101': 174.200000,
42
+ 'FAUJ830101': -1.010000,
43
+ 'GRAR740102': 10.500000,
44
+ 'GRAR740103': 124.000000,
45
+ 'GUYH850101': 1.910000,
46
+ 'HOPT810101': 3.000000,
47
+ 'JANJ780101': 94.700000,
48
+ 'KARP850101': 1.038000,
49
+ 'KYTJ820101': -4.500000,
50
+ 'ROSM880101': 12.100000,
51
+ 'VINM940101': 1.008000,
52
+ 'WERD780101': 0.490000,
53
+ 'ZIMJ680101': 0.830000,
54
+ 'ZIMJ680104': 10.760000,
55
+ },
56
+ 'N': {
57
+ 'BIGC670101': 75.700000,
58
+ 'CHAM820101': 0.134000,
59
+ 'CHOP780201': 0.670000,
60
+ 'CHOP780202': 0.890000,
61
+ 'CHOP780203': 1.460000,
62
+ 'EISD860101': -0.600000,
63
+ 'FASG760101': 132.120000,
64
+ 'FAUJ830101': -0.600000,
65
+ 'GRAR740102': 11.600000,
66
+ 'GRAR740103': 56.000000,
67
+ 'GUYH850101': 0.480000,
68
+ 'HOPT810101': 0.200000,
69
+ 'JANJ780101': 60.100000,
70
+ 'KARP850101': 1.117000,
71
+ 'KYTJ820101': -3.500000,
72
+ 'ROSM880101': 7.230000,
73
+ 'VINM940101': 1.048000,
74
+ 'WERD780101': 0.420000,
75
+ 'ZIMJ680101': 0.090000,
76
+ 'ZIMJ680104': 5.410000,
77
+ },
78
+ 'D': {
79
+ 'BIGC670101': 68.400000,
80
+ 'CHAM820101': 0.105000,
81
+ 'CHOP780201': 1.010000,
82
+ 'CHOP780202': 0.540000,
83
+ 'CHOP780203': 1.520000,
84
+ 'EISD860101': -1.200000,
85
+ 'FASG760101': 133.100000,
86
+ 'FAUJ830101': -0.770000,
87
+ 'GRAR740102': 13.000000,
88
+ 'GRAR740103': 54.000000,
89
+ 'GUYH850101': 0.780000,
90
+ 'HOPT810101': 3.000000,
91
+ 'JANJ780101': 60.600000,
92
+ 'KARP850101': 1.033000,
93
+ 'KYTJ820101': -3.500000,
94
+ 'ROSM880101': 8.720000,
95
+ 'VINM940101': 1.068000,
96
+ 'WERD780101': 0.370000,
97
+ 'ZIMJ680101': 0.640000,
98
+ 'ZIMJ680104': 2.770000,
99
+ },
100
+ 'C': {
101
+ 'BIGC670101': 68.300000,
102
+ 'CHAM820101': 0.128000,
103
+ 'CHOP780201': 0.700000,
104
+ 'CHOP780202': 1.190000,
105
+ 'CHOP780203': 0.960000,
106
+ 'EISD860101': 0.380000,
107
+ 'FASG760101': 121.150000,
108
+ 'FAUJ830101': 1.540000,
109
+ 'GRAR740102': 5.500000,
110
+ 'GRAR740103': 55.000000,
111
+ 'GUYH850101': -1.420000,
112
+ 'HOPT810101': -1.000000,
113
+ 'JANJ780101': 15.500000,
114
+ 'KARP850101': 0.960000,
115
+ 'KYTJ820101': 2.500000,
116
+ 'ROSM880101': -0.340000,
117
+ 'VINM940101': 0.906000,
118
+ 'WERD780101': 0.830000,
119
+ 'ZIMJ680101': 1.480000,
120
+ 'ZIMJ680104': 5.050000,
121
+ },
122
+ 'Q': {
123
+ 'BIGC670101': 89.700000,
124
+ 'CHAM820101': 0.180000,
125
+ 'CHOP780201': 1.110000,
126
+ 'CHOP780202': 1.100000,
127
+ 'CHOP780203': 0.960000,
128
+ 'EISD860101': -0.220000,
129
+ 'FASG760101': 146.150000,
130
+ 'FAUJ830101': -0.220000,
131
+ 'GRAR740102': 10.500000,
132
+ 'GRAR740103': 85.000000,
133
+ 'GUYH850101': 0.950000,
134
+ 'HOPT810101': 0.200000,
135
+ 'JANJ780101': 68.700000,
136
+ 'KARP850101': 1.165000,
137
+ 'KYTJ820101': -3.500000,
138
+ 'ROSM880101': 6.390000,
139
+ 'VINM940101': 1.037000,
140
+ 'WERD780101': 0.350000,
141
+ 'ZIMJ680101': 0.000000,
142
+ 'ZIMJ680104': 5.650000,
143
+ },
144
+ 'E': {
145
+ 'BIGC670101': 84.700000,
146
+ 'CHAM820101': 0.151000,
147
+ 'CHOP780201': 1.510000,
148
+ 'CHOP780202': 0.370000,
149
+ 'CHOP780203': 0.950000,
150
+ 'EISD860101': -0.760000,
151
+ 'FASG760101': 147.130000,
152
+ 'FAUJ830101': -0.640000,
153
+ 'GRAR740102': 12.300000,
154
+ 'GRAR740103': 83.000000,
155
+ 'GUYH850101': 0.830000,
156
+ 'HOPT810101': 3.000000,
157
+ 'JANJ780101': 68.200000,
158
+ 'KARP850101': 1.094000,
159
+ 'KYTJ820101': -3.500000,
160
+ 'ROSM880101': 7.350000,
161
+ 'VINM940101': 1.094000,
162
+ 'WERD780101': 0.380000,
163
+ 'ZIMJ680101': 0.650000,
164
+ 'ZIMJ680104': 3.220000,
165
+ },
166
+ 'G': {
167
+ 'BIGC670101': 36.300000,
168
+ 'CHAM820101': 0.000000,
169
+ 'CHOP780201': 0.570000,
170
+ 'CHOP780202': 0.750000,
171
+ 'CHOP780203': 1.560000,
172
+ 'EISD860101': 0.000000,
173
+ 'FASG760101': 75.070000,
174
+ 'FAUJ830101': 0.000000,
175
+ 'GRAR740102': 9.000000,
176
+ 'GRAR740103': 3.000000,
177
+ 'GUYH850101': 0.330000,
178
+ 'HOPT810101': 0.000000,
179
+ 'JANJ780101': 24.500000,
180
+ 'KARP850101': 1.142000,
181
+ 'KYTJ820101': -0.400000,
182
+ 'ROSM880101': 0.000000,
183
+ 'VINM940101': 1.031000,
184
+ 'WERD780101': 0.410000,
185
+ 'ZIMJ680101': 0.100000,
186
+ 'ZIMJ680104': 5.970000,
187
+ },
188
+ 'H': {
189
+ 'BIGC670101': 91.900000,
190
+ 'CHAM820101': 0.230000,
191
+ 'CHOP780201': 1.000000,
192
+ 'CHOP780202': 0.870000,
193
+ 'CHOP780203': 0.950000,
194
+ 'EISD860101': 0.640000,
195
+ 'FASG760101': 155.160000,
196
+ 'FAUJ830101': 0.130000,
197
+ 'GRAR740102': 10.400000,
198
+ 'GRAR740103': 96.000000,
199
+ 'GUYH850101': -0.500000,
200
+ 'HOPT810101': -0.500000,
201
+ 'JANJ780101': 50.700000,
202
+ 'KARP850101': 0.982000,
203
+ 'KYTJ820101': -3.200000,
204
+ 'ROSM880101': 3.820000,
205
+ 'VINM940101': 0.950000,
206
+ 'WERD780101': 0.700000,
207
+ 'ZIMJ680101': 1.100000,
208
+ 'ZIMJ680104': 7.590000,
209
+ },
210
+ 'I': {
211
+ 'BIGC670101': 102.000000,
212
+ 'CHAM820101': 0.186000,
213
+ 'CHOP780201': 1.080000,
214
+ 'CHOP780202': 1.600000,
215
+ 'CHOP780203': 0.470000,
216
+ 'EISD860101': 1.900000,
217
+ 'FASG760101': 131.170000,
218
+ 'FAUJ830101': 1.800000,
219
+ 'GRAR740102': 5.200000,
220
+ 'GRAR740103': 111.000000,
221
+ 'GUYH850101': -1.130000,
222
+ 'HOPT810101': -1.800000,
223
+ 'JANJ780101': 22.800000,
224
+ 'KARP850101': 1.002000,
225
+ 'KYTJ820101': 4.500000,
226
+ 'ROSM880101': -3.020000,
227
+ 'VINM940101': 0.927000,
228
+ 'WERD780101': 0.790000,
229
+ 'ZIMJ680101': 3.070000,
230
+ 'ZIMJ680104': 6.020000,
231
+ },
232
+ 'L': {
233
+ 'BIGC670101': 102.000000,
234
+ 'CHAM820101': 0.186000,
235
+ 'CHOP780201': 1.210000,
236
+ 'CHOP780202': 1.300000,
237
+ 'CHOP780203': 0.500000,
238
+ 'EISD860101': 1.900000,
239
+ 'FASG760101': 131.170000,
240
+ 'FAUJ830101': 1.700000,
241
+ 'GRAR740102': 4.900000,
242
+ 'GRAR740103': 111.000000,
243
+ 'GUYH850101': -1.180000,
244
+ 'HOPT810101': -1.800000,
245
+ 'JANJ780101': 27.600000,
246
+ 'KARP850101': 0.967000,
247
+ 'KYTJ820101': 3.800000,
248
+ 'ROSM880101': -3.020000,
249
+ 'VINM940101': 0.935000,
250
+ 'WERD780101': 0.770000,
251
+ 'ZIMJ680101': 2.520000,
252
+ 'ZIMJ680104': 5.980000,
253
+ },
254
+ 'K': {
255
+ 'BIGC670101': 105.100000,
256
+ 'CHAM820101': 0.219000,
257
+ 'CHOP780201': 1.160000,
258
+ 'CHOP780202': 0.740000,
259
+ 'CHOP780203': 1.190000,
260
+ 'EISD860101': -0.570000,
261
+ 'FASG760101': 146.190000,
262
+ 'FAUJ830101': -0.990000,
263
+ 'GRAR740102': 11.300000,
264
+ 'GRAR740103': 119.000000,
265
+ 'GUYH850101': 1.400000,
266
+ 'HOPT810101': 3.000000,
267
+ 'JANJ780101': 103.000000,
268
+ 'KARP850101': 1.093000,
269
+ 'KYTJ820101': -3.900000,
270
+ 'ROSM880101': 6.130000,
271
+ 'VINM940101': 1.102000,
272
+ 'WERD780101': 0.310000,
273
+ 'ZIMJ680101': 1.600000,
274
+ 'ZIMJ680104': 9.740000,
275
+ },
276
+ 'M': {
277
+ 'BIGC670101': 97.700000,
278
+ 'CHAM820101': 0.221000,
279
+ 'CHOP780201': 1.450000,
280
+ 'CHOP780202': 1.050000,
281
+ 'CHOP780203': 0.600000,
282
+ 'EISD860101': 2.400000,
283
+ 'FASG760101': 149.210000,
284
+ 'FAUJ830101': 1.230000,
285
+ 'GRAR740102': 5.700000,
286
+ 'GRAR740103': 105.000000,
287
+ 'GUYH850101': -1.590000,
288
+ 'HOPT810101': -1.300000,
289
+ 'JANJ780101': 33.500000,
290
+ 'KARP850101': 0.947000,
291
+ 'KYTJ820101': 1.900000,
292
+ 'ROSM880101': -1.300000,
293
+ 'VINM940101': 0.952000,
294
+ 'WERD780101': 0.760000,
295
+ 'ZIMJ680101': 1.400000,
296
+ 'ZIMJ680104': 5.740000,
297
+ },
298
+ 'F': {
299
+ 'BIGC670101': 113.900000,
300
+ 'CHAM820101': 0.290000,
301
+ 'CHOP780201': 1.130000,
302
+ 'CHOP780202': 1.380000,
303
+ 'CHOP780203': 0.660000,
304
+ 'EISD860101': 2.300000,
305
+ 'FASG760101': 165.190000,
306
+ 'FAUJ830101': 1.790000,
307
+ 'GRAR740102': 5.200000,
308
+ 'GRAR740103': 132.000000,
309
+ 'GUYH850101': -2.120000,
310
+ 'HOPT810101': -2.500000,
311
+ 'JANJ780101': 25.500000,
312
+ 'KARP850101': 0.930000,
313
+ 'KYTJ820101': 2.800000,
314
+ 'ROSM880101': -3.240000,
315
+ 'VINM940101': 0.915000,
316
+ 'WERD780101': 0.870000,
317
+ 'ZIMJ680101': 2.750000,
318
+ 'ZIMJ680104': 5.480000,
319
+ },
320
+ 'P': {
321
+ 'BIGC670101': 73.600000,
322
+ 'CHAM820101': 0.131000,
323
+ 'CHOP780201': 0.570000,
324
+ 'CHOP780202': 0.550000,
325
+ 'CHOP780203': 1.560000,
326
+ 'EISD860101': 1.200000,
327
+ 'FASG760101': 115.130000,
328
+ 'FAUJ830101': 0.720000,
329
+ 'GRAR740102': 8.000000,
330
+ 'GRAR740103': 32.500000,
331
+ 'GUYH850101': 0.730000,
332
+ 'HOPT810101': 0.000000,
333
+ 'JANJ780101': 51.500000,
334
+ 'KARP850101': 1.055000,
335
+ 'KYTJ820101': -1.600000,
336
+ 'ROSM880101': -1.750000,
337
+ 'VINM940101': 1.049000,
338
+ 'WERD780101': 0.350000,
339
+ 'ZIMJ680101': 2.700000,
340
+ 'ZIMJ680104': 6.300000,
341
+ },
342
+ 'S': {
343
+ 'BIGC670101': 54.900000,
344
+ 'CHAM820101': 0.062000,
345
+ 'CHOP780201': 0.770000,
346
+ 'CHOP780202': 0.750000,
347
+ 'CHOP780203': 1.430000,
348
+ 'EISD860101': 0.010000,
349
+ 'FASG760101': 105.090000,
350
+ 'FAUJ830101': -0.040000,
351
+ 'GRAR740102': 9.200000,
352
+ 'GRAR740103': 32.000000,
353
+ 'GUYH850101': 0.520000,
354
+ 'HOPT810101': 0.300000,
355
+ 'JANJ780101': 42.000000,
356
+ 'KARP850101': 1.169000,
357
+ 'KYTJ820101': -0.800000,
358
+ 'ROSM880101': 4.350000,
359
+ 'VINM940101': 1.046000,
360
+ 'WERD780101': 0.490000,
361
+ 'ZIMJ680101': 0.140000,
362
+ 'ZIMJ680104': 5.680000,
363
+ },
364
+ 'T': {
365
+ 'BIGC670101': 71.200000,
366
+ 'CHAM820101': 0.108000,
367
+ 'CHOP780201': 0.830000,
368
+ 'CHOP780202': 1.190000,
369
+ 'CHOP780203': 0.980000,
370
+ 'EISD860101': 0.520000,
371
+ 'FASG760101': 119.120000,
372
+ 'FAUJ830101': 0.260000,
373
+ 'GRAR740102': 8.600000,
374
+ 'GRAR740103': 61.000000,
375
+ 'GUYH850101': 0.070000,
376
+ 'HOPT810101': -0.400000,
377
+ 'JANJ780101': 45.000000,
378
+ 'KARP850101': 1.073000,
379
+ 'KYTJ820101': -0.700000,
380
+ 'ROSM880101': 3.860000,
381
+ 'VINM940101': 0.997000,
382
+ 'WERD780101': 0.380000,
383
+ 'ZIMJ680101': 0.540000,
384
+ 'ZIMJ680104': 5.660000,
385
+ },
386
+ 'W': {
387
+ 'BIGC670101': 135.400000,
388
+ 'CHAM820101': 0.409000,
389
+ 'CHOP780201': 1.080000,
390
+ 'CHOP780202': 1.370000,
391
+ 'CHOP780203': 0.600000,
392
+ 'EISD860101': 2.600000,
393
+ 'FASG760101': 204.240000,
394
+ 'FAUJ830101': 2.250000,
395
+ 'GRAR740102': 5.400000,
396
+ 'GRAR740103': 170.000000,
397
+ 'GUYH850101': -0.510000,
398
+ 'HOPT810101': -3.400000,
399
+ 'JANJ780101': 34.700000,
400
+ 'KARP850101': 0.925000,
401
+ 'KYTJ820101': -0.900000,
402
+ 'ROSM880101': -2.860000,
403
+ 'VINM940101': 0.904000,
404
+ 'WERD780101': 0.860000,
405
+ 'ZIMJ680101': 0.310000,
406
+ 'ZIMJ680104': 5.890000,
407
+ },
408
+ 'Y': {
409
+ 'BIGC670101': 116.200000,
410
+ 'CHAM820101': 0.298000,
411
+ 'CHOP780201': 0.690000,
412
+ 'CHOP780202': 1.470000,
413
+ 'CHOP780203': 1.140000,
414
+ 'EISD860101': 1.600000,
415
+ 'FASG760101': 181.190000,
416
+ 'FAUJ830101': 0.960000,
417
+ 'GRAR740102': 6.200000,
418
+ 'GRAR740103': 136.000000,
419
+ 'GUYH850101': -0.210000,
420
+ 'HOPT810101': -2.300000,
421
+ 'JANJ780101': 55.200000,
422
+ 'KARP850101': 0.961000,
423
+ 'KYTJ820101': -1.300000,
424
+ 'ROSM880101': 0.980000,
425
+ 'VINM940101': 0.929000,
426
+ 'WERD780101': 0.640000,
427
+ 'ZIMJ680101': 2.970000,
428
+ 'ZIMJ680104': 5.660000,
429
+ },
430
+ 'V': {
431
+ 'BIGC670101': 85.100000,
432
+ 'CHAM820101': 0.140000,
433
+ 'CHOP780201': 1.060000,
434
+ 'CHOP780202': 1.700000,
435
+ 'CHOP780203': 0.590000,
436
+ 'EISD860101': 1.500000,
437
+ 'FASG760101': 117.150000,
438
+ 'FAUJ830101': 1.220000,
439
+ 'GRAR740102': 5.900000,
440
+ 'GRAR740103': 84.000000,
441
+ 'GUYH850101': -1.270000,
442
+ 'HOPT810101': -1.500000,
443
+ 'JANJ780101': 23.700000,
444
+ 'KARP850101': 0.982000,
445
+ 'KYTJ820101': 4.200000,
446
+ 'ROSM880101': -2.180000,
447
+ 'VINM940101': 0.931000,
448
+ 'WERD780101': 0.720000,
449
+ 'ZIMJ680101': 1.790000,
450
+ 'ZIMJ680104': 5.960000,
451
+ },
452
+ }
453
+
454
+ # Feature descriptions
455
+ FEATURE_DESCRIPTIONS = {
456
+ 'BIGC670101': 'Residue volume (Bigelow, 1967)',
457
+ 'CHAM820101': 'Polarizability parameter (Charton-Charton, 1982)',
458
+ 'CHOP780201': 'Normalized frequency of alpha-helix (Chou-Fasman, 1978b)',
459
+ 'CHOP780202': 'Normalized frequency of beta-sheet (Chou-Fasman, 1978b)',
460
+ 'CHOP780203': 'Normalized frequency of beta-turn (Chou-Fasman, 1978b)',
461
+ 'EISD860101': 'Solvation free energy (Eisenberg-McLachlan, 1986)',
462
+ 'FASG760101': 'Molecular weight (Fasman, 1976)',
463
+ 'FAUJ830101': 'Hydrophobic parameter pi (Fauchere-Pliska, 1983)',
464
+ 'GRAR740102': 'Polarity (Grantham, 1974)',
465
+ 'GRAR740103': 'Volume (Grantham, 1974)',
466
+ 'GUYH850101': 'Partition energy (Guy, 1985)',
467
+ 'HOPT810101': 'Hydrophilicity value (Hopp-Woods, 1981)',
468
+ 'JANJ780101': 'Average accessible surface area (Janin et al., 1978)',
469
+ 'KARP850101': 'Flexibility parameter for no rigid neighbors (Karplus-Schulz, 1985)',
470
+ 'KYTJ820101': 'Hydropathy index (Kyte-Doolittle, 1982)',
471
+ 'ROSM880101': 'Side chain hydropathy, uncorrected for solvation (Roseman, 1988)',
472
+ 'VINM940101': 'Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)',
473
+ 'WERD780101': 'Propensity to be buried inside (Wertz-Scheraga, 1978)',
474
+ 'ZIMJ680101': 'Hydrophobicity (Zimmerman et al., 1968)',
475
+ 'ZIMJ680104': 'Isoelectric point (Zimmerman et al., 1968)',
476
+ }
477
+
478
+ # Convert to numpy array
479
def get_feature_vector(aa, feature_list=None):
    """Return the AAindex feature vector for a single amino acid.

    Args:
        aa: Single-letter amino acid code. Unknown codes fall back to
            Alanine ('A') rather than raising.
        feature_list: Feature codes to extract, in order. None means all
            features, in sorted-key order.

    Returns:
        1-D numpy array of the requested feature values.
    """
    # Map any residue not in the table (e.g. 'X') onto Alanine.
    table = AA_PROPERTIES_AAINDEX.get(aa, AA_PROPERTIES_AAINDEX["A"])

    keys = sorted(table) if feature_list is None else feature_list

    return np.array([table[k] for k in keys])
498
+
499
+
500
def get_sequence_features(sequence, feature_list=None):
    """Return a per-residue feature matrix of shape [len(sequence), N_features]."""
    rows = [get_feature_vector(residue, feature_list) for residue in sequence]
    return np.array(rows)
503
+ # Test
504
if __name__ == "__main__":
    # Smoke test: print a sample of the table and run the sequence helper.
    print("Loaded 20 features for 20 amino acids")
    print("\nExample: Alanine (A)")
    alanine = AA_PROPERTIES_AAINDEX["A"]
    for key in list(alanine)[:5]:
        value = alanine[key]
        print(f" {key}: {value:.4f} - {FEATURE_DESCRIPTIONS[key][:50]}")

    print("\nTest sequence features:")
    seq = "ARNDCQEG"
    features = get_sequence_features(seq)
    print(f" Sequence: {seq}")
    print(f" Feature matrix shape: {features.shape}")
src/library/.DS_Store ADDED
Binary file (6.15 kB). View file
 
src/library/hla_library/A_prot.fasta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:283b031c980e3e64fb3985da4012c9682bb6cbe1bef03ef85035b833a20c24b3
3
+ size 1350496
src/library/hla_library/B_prot.fasta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e5b7563254487f8ed55746a1fe90638b337fdb2a145923919208b511f898fbb
3
+ size 1623286
src/library/hla_library/C_prot.fasta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cecd489f0feca57e06c3dff8dd2e92f1142dfecae6b3b81cc7299b16e49567ed
3
+ size 1313293
src/library/hla_library/E_prot.fasta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eab5e5d1ebb1d9d50c5893b52a713390ff2121e1fef184ca0639f4cabff1e14
3
+ size 9590
src/library/hla_prot.fasta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dba02678d244bf6b1c5eb421704d8319ffe7c21deb04c4bab0ba1fabd38e147
3
+ size 13753408
src/library/trajs_aa.tsv ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Species Gene Allele AccNum Functionality aa_seq
2
+ Homosap TRAJ1 TRAJ1*01 M94081 F YESITSQLQFGKGTRVSTSP
3
+ Homosap TRAJ10 TRAJ10*01 M94081 F ILTGGGNKLTFGTGTQLKVEL
4
+ Homosap TRAJ11 TRAJ11*01 M94081 F NSGYSTLTFGKGTMLLVSP
5
+ Homosap TRAJ12 TRAJ12*01 X02885 F MDSSYKLIFGSGTRLLVRP
6
+ Homosap TRAJ13 TRAJ13*01 M94081 F NSGGYQKVTFGIGTKLQVIP
7
+ Homosap TRAJ13 TRAJ13*02 AB258131 F NSGGYQKVTFGTGTKLQVIP
8
+ Homosap TRAJ14 TRAJ14*01 M94081 F IYSTFIFGSGTRLSVKP
9
+ Homosap TRAJ15 TRAJ15*01 X05775 F NQAGTALIFGKGTTLSVSS
10
+ Homosap TRAJ15 TRAJ15*02 M94081 F NQAGTALIFGKGTHLSVSS
11
+ Homosap TRAJ16 TRAJ16*01 M94081 F FSDGQKLLFARGTMLKVDL
12
+ Homosap TRAJ16 TRAJ16*02 IMGT000024 F FSDGQKLLFARGTMLKVDL
13
+ Homosap TRAJ17 TRAJ17*01 X05773 F IKAAGNKLTFGGGTRVLVKP
14
+ Homosap TRAJ18 TRAJ18*01 M94081 F DRGSTLGRLYFGRGTQLTVWP
15
+ Homosap TRAJ2 TRAJ2*01 M94081 F NTGGTIDKLTFGKGTHVFIIS
16
+ Homosap TRAJ20 TRAJ20*01 M94081 F SNDYKLSFGAGTTVTVRA
17
+ Homosap TRAJ21 TRAJ21*01 M94081 F YNFNKFYFGSGTKLNVKP
18
+ Homosap TRAJ22 TRAJ22*01 X02886 F SSGSARQLTFGSGTQLTVLP
19
+ Homosap TRAJ23 TRAJ23*01 M94081 F IYNQGGKLIFGQGTELSVKP
20
+ Homosap TRAJ23 TRAJ23*02 X58763 F IYNQGGKLIFGQGTELSVKP
21
+ Homosap TRAJ24 TRAJ24*01 X02887 F TTDSWGKFEFGAGTQVVVTP
22
+ Homosap TRAJ24 TRAJ24*02 M94081 F TTDSWGKLQFGAGTQVVVTP
23
+ Homosap TRAJ24 TRAJ24*03 IMGT000024 F TTDSWGKFQFGAGTQVVVTP
24
+ Homosap TRAJ25 TRAJ25*01 M94081 F XEGQGFSFIFGKGTRLLVKP
25
+ Homosap TRAJ26 TRAJ26*01 M94081 F DNYGQNFVFGPGTRLSVLP
26
+ Homosap TRAJ27 TRAJ27*01 M94081 F NTNAGKSTFGDGTTLTVKP
27
+ Homosap TRAJ28 TRAJ28*01 M94081 F YSGAGSYQLTFGKGTKLSVIP
28
+ Homosap TRAJ29 TRAJ29*01 M94081 F NSGNTPLVFGKGTRLSVIA
29
+ Homosap TRAJ3 TRAJ3*01 X02884 F GYSSASKIIFGSGTRLSIRP
30
+ Homosap TRAJ30 TRAJ30*01 M94081 F NRDDKIIFGKGTRLHILP
31
+ Homosap TRAJ31 TRAJ31*01 M94081 F NNNARLMFGDGTQLVVKP
32
+ Homosap TRAJ32 TRAJ32*01 M94081 F NYGGATNKLIFGTGTLLAVQP
33
+ Homosap TRAJ32 TRAJ32*02 IMGT000024 F NYGGATNKLIFGTGTLLAVQP
34
+ Homosap TRAJ33 TRAJ33*01 M94081 F DSNYQLIWGAGTKLIIKP
35
+ Homosap TRAJ34 TRAJ34*01 M35622 F SYNTDKLIFGTGTRLQVFP
36
+ Homosap TRAJ35 TRAJ35*01 M94081 F IGFGNVLHCGSGTQVIVLP
37
+ Homosap TRAJ36 TRAJ36*01 M94081 F QTGANNLFFGTGTRLTVIP
38
+ Homosap TRAJ37 TRAJ37*01 M94081 F GSGNTGKLIFGQGTTLQVKP
39
+ Homosap TRAJ37 TRAJ37*02 IMGT000024 F GSSNTGKLIFGQGTTLQVKP
40
+ Homosap TRAJ38 TRAJ38*01 M94081 F NAGNNRKLIWGLGTSLAVNP
41
+ Homosap TRAJ39 TRAJ39*01 M94081 F NNNAGNMLTFGGGTRLMVKP
42
+ Homosap TRAJ4 TRAJ4*01 M94081 F FSGGYNKLIFGAGTRLAVHP
43
+ Homosap TRAJ40 TRAJ40*01 M35620 F TTSGTYKYIFGTGTRLKVLA
44
+ Homosap TRAJ41 TRAJ41*01 M94081 F NSNSGYALNFGKGTSLLVTP
45
+ Homosap TRAJ42 TRAJ42*01 M94081 F NYGGSQGNLIFGKGTKLSVKP
46
+ Homosap TRAJ43 TRAJ43*01 M94081 F NNNDMRFGAGTRLTVKP
47
+ Homosap TRAJ44 TRAJ44*01 M35619 F NTGTASKLTFGTGTRLQVTL
48
+ Homosap TRAJ45 TRAJ45*01 M94081 F YSGGGADGLTFGKGTHLIIQP
49
+ Homosap TRAJ46 TRAJ46*01 M94081 F KKSSGDKLTFGTGTRLAVRP
50
+ Homosap TRAJ47 TRAJ47*01 M94081 F EYGNKLVFGAGTILRVKS
51
+ Homosap TRAJ47 TRAJ47*02 AF033825 (F) EYGNKLVFGAGTILRVKS
52
+ Homosap TRAJ48 TRAJ48*01 M94081 F SNFGNEKLTFGTGTRLTIIP
53
+ Homosap TRAJ49 TRAJ49*01 M94081 F NTGNQFYFGTGTSLTVIP
54
+ Homosap TRAJ5 TRAJ5*01 M94081 F DTGRRALTFGSGTRLQVQP
55
+ Homosap TRAJ50 TRAJ50*01 M94081 F KTSYDKVIFGPGTSLSVIP
56
+ Homosap TRAJ52 TRAJ52*01 M94081 F NAGGTSYGKLTFGQGTILTVHP
57
+ Homosap TRAJ53 TRAJ53*01 M94081 F NSGGSNYKLTFGKGTLLTVNP
58
+ Homosap TRAJ54 TRAJ54*01 M94081 F IQGAQKLVFGQGTRLTINP
59
+ Homosap TRAJ56 TRAJ56*01 M94081 F YTGANSKLTFGKGITLSVRP
60
+ Homosap TRAJ57 TRAJ57*01 M94081 F TQGGSEKLVFGKGTKLTVNP
61
+ Homosap TRAJ58 TRAJ58*01 M94081 F ETSGSRLTFGEGTQLTVNP
62
+ Homosap TRAJ6 TRAJ6*01 M16747 F ASGGSYIPTFGRGTSLIVHP
63
+ Homosap TRAJ7 TRAJ7*01 M94081 F DYGNNRLAFGKGNQVVVIP
64
+ Homosap TRAJ8 TRAJ8*01 M94081 F NTGFQKLVFGTGTRLLVSP
65
+ Homosap TRAJ9 TRAJ9*01 M94081 F GNTGGFKTIFGAGTRLFVKA
src/library/trajs_nt.tsv ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Species Gene Allele AccNum Functionality nt_seq
2
+ Homosap TRAJ10 TRAJ10*01 M94081 F atactcacgggaggaggaaacaaactcacctttgggacaggcactcagctaaaagtggaactca
3
+ Homosap TRAJ11 TRAJ11*01 M94081 F tgaattcaggatacagcaccctcacctttgggaaggggactatgcttctagtctctccag
4
+ Homosap TRAJ12 TRAJ12*01 X02885 F ggatggatagcagctataaattgatcttcgggagtgggaccagactgctggtcaggcctg
5
+ Homosap TRAJ13 TRAJ13*01 M94081 F tgaattctgggggttaccagaaagttacctttggaattggaacaaagctccaagtcatcccaa
6
+ Homosap TRAJ13 TRAJ13*02 AB258131 F tgaattctgggggttaccagaaagttacctttggaactggaacaaagctccaagtcatcccaa
7
+ Homosap TRAJ14 TRAJ14*01 M94081 F atttatagcacattcatctttgggagtgggacaagattatcagtaaaacctg
8
+ Homosap TRAJ15 TRAJ15*01 X05775 F ccaaccaggcaggaactgctctgatctttgggaagggaaccaccttatcagtgagttcca
9
+ Homosap TRAJ15 TRAJ15*02 M94081 F ccaaccaggcaggaactgctctgatctttgggaagggaacccacctatcagtgagttcca
10
+ Homosap TRAJ16 TRAJ16*01 M94081 F ggttttcagatggccagaagctgctctttgcaaggggaaccatgttaaaggtggatctta
11
+ Homosap TRAJ16 TRAJ16*02 IMGT000024 F ggttttcagatggccagaagctgctctttgcaagggggaccatgttaaaggtggatctta
12
+ Homosap TRAJ17 TRAJ17*01 X05773 F tgatcaaagctgcaggcaacaagctaacttttggaggaggaaccagggtgctagttaaaccaa
13
+ Homosap TRAJ18 TRAJ18*01 M94081 F ccgacagaggctcaaccctggggaggctatactttggaagaggaactcagttgactgtctggcctg
14
+ Homosap TRAJ20 TRAJ20*01 M94081 F gttctaacgactacaagctcagctttggagccggaaccacagtaactgtaagagcaa
15
+ Homosap TRAJ21 TRAJ21*01 M94081 F tacaacttcaacaaattttactttggatctgggaccaaactcaatgtaaaaccaa
16
+ Homosap TRAJ22 TRAJ22*01 X02886 F tttcttctggttctgcaaggcaactgacctttggatctgggacacaattgactgttttacctg
17
+ Homosap TRAJ23 TRAJ23*01 M94081 F tgatttataaccagggaggaaagcttatcttcggacagggaacggagttatctgtgaaaccca
18
+ Homosap TRAJ23 TRAJ23*02 X58763 F tgatttataaccagggaggaaagcttatcttcggacagggaacggagctatctgtgaaaccca
19
+ Homosap TRAJ24 TRAJ24*01 X02887 F tgacaactgacagctgggggaaattcgagtttggagcagggacccaggttgtggtcaccccag
20
+ Homosap TRAJ24 TRAJ24*02 M94081 F tgacaactgacagctgggggaaattgcagtttggagcagggacccaggttgtggtcaccccag
21
+ Homosap TRAJ24 TRAJ24*03 IMGT000024 F tgacaactgacagctgggggaaattccagtttggagcagggacccaggttgtggtcaccccag
22
+ Homosap TRAJ26 TRAJ26*01 M94081 F gggataactatggtcagaattttgtctttggtcccggaaccagattgtccgtgctgccct
23
+ Homosap TRAJ27 TRAJ27*01 M94081 F taacaccaatgcaggcaaatcaacctttggggatgggactacgctcactgtgaagccaa
24
+ Homosap TRAJ28 TRAJ28*01 M94081 F catactctggggctgggagttaccaactcactttcgggaaggggaccaaactctcggtcataccaa
25
+ Homosap TRAJ29 TRAJ29*01 M94081 F ggaattcaggaaacacacctcttgtctttggaaagggcacaagactttctgtgattgcaa
26
+ Homosap TRAJ3 TRAJ3*01 X02884 F ggggtacagcagtgcttccaagataatctttggatcagggaccagactcagcatccggccaa
27
+ Homosap TRAJ30 TRAJ30*01 M94081 F tgaacagagatgacaagatcatctttggaaaagggacacgacttcatattctcccca
28
+ Homosap TRAJ31 TRAJ31*01 M94081 F ggaataacaatgccagactcatgtttggagatggaactcagctggtggtgaagccca
29
+ Homosap TRAJ32 TRAJ32*01 M94081 F tgaattatggcggtgctacaaacaagctcatctttggaactggcactctgcttgctgtccagccaa
30
+ Homosap TRAJ32 TRAJ32*02 IMGT000024 F tgaattatggtggtgctacaaacaagctcatctttggaactggcactctgcttgctgtccagccaa
31
+ Homosap TRAJ33 TRAJ33*01 M94081 F tggatagcaactatcagttaatctggggcgctgggaccaagctaattataaagccag
32
+ Homosap TRAJ34 TRAJ34*01 M35622 F tcttataacaccgacaagctcatctttgggactgggaccagattacaagtctttccaa
33
+ Homosap TRAJ35 TRAJ35*01 M94081 F gataggctttgggaatgtgctgcattgcgggtccggcactcaagtgattgttttaccac
34
+ Homosap TRAJ36 TRAJ36*01 M94081 F tcaaactggggcaaacaacctcttctttgggactggaacgagactcaccgttattccct
35
+ Homosap TRAJ37 TRAJ37*01 M94081 F tggctctggcaacacaggcaaactaatctttgggcaagggacaactttacaagtaaaaccag
36
+ Homosap TRAJ37 TRAJ37*02 IMGT000024 F tggctctagcaacacaggcaaactaatctttgggcaagggacaactttacaagtaaaaccag
37
+ Homosap TRAJ38 TRAJ38*01 M94081 F taatgctggcaacaaccgtaagctgatttggggattgggaacaagcctggcagtaaatccga
38
+ Homosap TRAJ39 TRAJ39*01 M94081 F tgaataataatgcaggcaacatgctcacctttggagggggaacaaggttaatggtcaaacccc
39
+ Homosap TRAJ4 TRAJ4*01 M94081 F tgttttctggtggctacaataagctgatttttggagcagggaccaggctggctgtacacccat
40
+ Homosap TRAJ40 TRAJ40*01 M35620 F actacctcaggaacctacaaatacatctttggaacaggcaccaggctgaaggttttagcaa
41
+ Homosap TRAJ41 TRAJ41*01 M94081 F gaactcaaattccgggtatgcactcaacttcggcaaaggcacctcgctgttggtcacacccc
42
+ Homosap TRAJ42 TRAJ42*01 M94081 F tgaattatggaggaagccaaggaaatctcatctttggaaaaggcactaaactctctgttaaaccaa
43
+ Homosap TRAJ43 TRAJ43*01 M94081 F acaataacaatgacatgcgctttggagcagggaccagactgacagtaaaaccaa
44
+ Homosap TRAJ44 TRAJ44*01 M35619 F taaataccggcactgccagtaaactcacctttgggactggaacaagacttcaggtcacgctcg
45
+ Homosap TRAJ45 TRAJ45*01 M94081 F tgtattcaggaggaggtgctgacggactcacctttggcaaagggactcatctaatcatccagccct
46
+ Homosap TRAJ46 TRAJ46*01 M94081 F agaagaaaagcagcggagacaagctgacttttgggaccgggactcgtttagcagttaggccca
47
+ Homosap TRAJ47 TRAJ47*01 M94081 F tggaatatggaaacaaactggtctttggcgcaggaaccattctgagagtcaagtcct
48
+ Homosap TRAJ47 TRAJ47*02 AF033825 (F) tggaatatggaaacaagctggtctttggcgcaggaaccattctgagagtcaagtcct
49
+ Homosap TRAJ48 TRAJ48*01 M94081 F tatctaactttggaaatgagaaattaacctttgggactggaacaagactcaccatcataccca
50
+ Homosap TRAJ49 TRAJ49*01 M94081 F gaacaccggtaaccagttctattttgggacagggacaagtttgacggtcattccaa
51
+ Homosap TRAJ5 TRAJ5*01 M94081 F tggacacgggcaggagagcacttacttttgggagtggaacaagactccaagtgcaaccaa
52
+ Homosap TRAJ50 TRAJ50*01 M94081 F tgaaaacctcctacgacaaggtgatatttgggccagggacaagcttatcagtcattccaa
53
+ Homosap TRAJ52 TRAJ52*01 M94081 F ctaatgctggtggtactagctatggaaagctgacatttggacaagggaccatcttgactgtccatccaa
54
+ Homosap TRAJ53 TRAJ53*01 M94081 F agaatagtggaggtagcaactataaactgacatttggaaaaggaactctcttaaccgtgaatccaa
55
+ Homosap TRAJ54 TRAJ54*01 M94081 F taattcagggagcccagaagctggtatttggccaaggaaccaggctgactatcaacccaa
56
+ Homosap TRAJ56 TRAJ56*01 M94081 F ttatactggagccaatagtaagctgacatttggaaaaggaataactctgagtgttagaccag
57
+ Homosap TRAJ57 TRAJ57*01 M94081 F taactcagggcggatctgaaaagctggtctttggaaagggaacgaaactgacagtaaacccat
58
+ Homosap TRAJ6 TRAJ6*01 M16747 F tgcatcaggaggaagctacatacctacatttggaagaggaaccagccttattgttcatccgt
59
+ Homosap TRAJ7 TRAJ7*01 M94081 F tgactatgggaacaacagactcgcttttgggaaggggaaccaagtggtggtcataccaa
60
+ Homosap TRAJ8 TRAJ8*01 M94081 F tgaacacaggctttcagaaacttgtatttggaactggcacccgacttctggtcagtccaa
61
+ Homosap TRAJ9 TRAJ9*01 M94081 F ggaaatactggaggcttcaaaactatctttggagcaggaacaagactatttgttaaagcaa
src/library/travs_aa.tsv ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Species Gene Allele AccNum Functionality aa_seq
2
+ Homosap TRAV1-1 TRAV1-1*01 AE000658 F GQSLEQPSEVTAVEGAIVQINCTYQTSGFYGLSWYQQHDGGAPTFLSYNALDGLEETGRFSSFLSRSDSYGYLLLQELQMKDSASYFCAVR
3
+ Homosap TRAV1-1 TRAV1-1*02 X04939 (F) GQSLEQPSEVTAVEGAIVQINCTYQTSGFYGLSWYQQHDGGAPTFLSYNGLDGLEETGRFSSFLSRSDSYGYLLLQELQMKDSASYFCA
4
+ Homosap TRAV1-2 TRAV1-2*01 AE000658 F GQNIDQPTEMTATEGAIVQINCTYQTSGFNGLFWYQQHAGEAPTFLSYNVLDGLEEKGRFSSFLSRSKGYSYLLLKELQMKDSASYLCAVR
5
+ Homosap TRAV1-2 TRAV1-2*02 U32544 [F] GQNIDQPTEMTATEGAIVQINCTYQTSGFNGLFWYQQHAGEAPTFLSYNVLDGLEEKG
6
+ Homosap TRAV1-2 TRAV1-2*03 IMGT000024 F GQNIDQPTEMTATEGAIVQINCTYQTSGFNGLFWYQQHAGEAPTFLSYNVLDGLEEKGRFSSFLSRSKGYSYLLLKELQMKDSASYLCAVR
7
+ Homosap TRAV10 TRAV10*01 AE000659 F KNQVEQSPQSLIILEGKNCTLQCNYTVSPFSNLRWYKQDTGRGPVSLTIMTFSENTKSNGRYTATLDADTKQSSLHITASQLSDSASYICVVS
8
+ Homosap TRAV10 TRAV10*02 IMGT000024 F KNQVEQSPQSLIILEGKNCTLQCNYTVSPFSNLRWYKQDTGRGPVSLTIMTFSENTKSNGRYTATLDADTKQSSLHITASQLSDSASYICVVS
9
+ Homosap TRAV12-1 TRAV12-1*01 AE000659 F RKEVEQDPGPFNVPEGATVAFNCTYSNSASQSFFWYRQDCRKEPKLLMSVYSSGNEDGRFTAQLNRASQYISLLIRDSKLSDSATYLCVVN
10
+ Homosap TRAV12-1 TRAV12-1*02 M17657 (F) RKEVEQDPGPFNVPEGATVAFNCTYSNSASQSFFWYRQDCRKEPKLLMSVYSSGNEDGRFTAHVNRASQYISLLIRDSKLSDSATYLCVVN
11
+ Homosap TRAV12-2 TRAV12-2*01 AE000659 F QKEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPELIMFIYSNGDKEDGRFTAQLNKASQYVSLLIRDSQPSDSATYLCAVN
12
+ Homosap TRAV12-2 TRAV12-2*02 M81774 (F) QKEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPELIMSIYSNGDKEDGRFTAQLNKASQYVSLLIRDSQPSDSATYLCAV
13
+ Homosap TRAV12-2 TRAV12-2*03 X04946 (F) GPLSVPEGAIASLNCTYSDRVSQSFFWYRQYSGKSPELIMSIYSNGDKEDGRFTAQLNKASQYVSLLIRDSQPSDSATYLCAVN
14
+ Homosap TRAV12-3 TRAV12-3*01 AE000659 F QKEVEQDPGPLSVPEGAIVSLNCTYSNSAFQYFMWYRQYSRKGPELLMYTYSSGNKEDGRFTAQVDKSSKYISLFIRDSQPSDSATYLCAMS
15
+ Homosap TRAV12-3 TRAV12-3*02 M17656 (F) QKEVEQDPGPLSVPEGAIVSLNCTYSNSAFQYFMWYRQYSRIGPELLMYTYSSGNKEDGRFTAQVDKSSKYISLFIRDSQPSDSATYLCAMS
16
+ Homosap TRAV13-1 TRAV13-1*01 AE000659 F GENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELGKGPQLIIDIRSNVGEKKDQRIAVTLNKTAKHFSLHITETQPEDSAVYFCAAS
17
+ Homosap TRAV13-1 TRAV13-1*02 X04954 (F) GENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELGKRPQLIIDIRSNVGEKKDQRIAVTLNKTAKHFSLHITETQPEDSAVYFCAAS
18
+ Homosap TRAV13-1 TRAV13-1*03 L11162 [F] GENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELGKRPQLIIDIRSNVGEKKDQRIAVTLNKTAKHFSLQIT
19
+ Homosap TRAV13-2 TRAV13-2*01 AE000659 F GESVGLHLPTLSVQEGDNSIINCAYSNSASDYFIWYKQESGKGPQFIIDIRSNMDKRQGQRVTVLLNKTVKHLSLQIAATQPGDSAVYFCAEN
20
+ Homosap TRAV13-2 TRAV13-2*02 M17658 (F) GESVGLHLPTLSVQEGDNSIINCAYSNSASDYFIWYKQESGKGPQFIIDIRSNMDKRQGQRVTVLLNKTVKHLSLQIAATQPGDSAVYFCAE
21
+ Homosap TRAV14/DV4 TRAV14/DV4*01 M21626 F AQKITQTQPGMFVQEKEAVTLDCTYDTSDPSYGLFWYKQPSSGEMIFLIYQGSYDQQNATEGRYSLNFQKARKSANLVISASQLGDSAMYFCAMRE
22
+ Homosap TRAV14/DV4 TRAV14/DV4*02 AE000659 F AQKITQTQPGMFVQEKEAVTLDCTYDTSDQSYGLFWYKQPSSGEMIFLIYQGSYDEQNATEGRYSLNFQKARKSANLVISASQLGDSAMYFCAMRE
23
+ Homosap TRAV14/DV4 TRAV14/DV4*03 M21624 (F) AQKITQTQPGMFVQEKEAVTLDCTYDTSDPSYGLFWYKQPSSGEMIFLIYQGSYDQQNATEGRYSLNFQKARKSANLVISASQLGDSAMYFCAM
24
+ Homosap TRAV14/DV4 TRAV14/DV4*04 L09758 [F] QKITQTQPGMFVQEKEAVTLDCTYDTSDQSYGLFWYKQPSSGEMIFLIYQGSYDEQNATEGRYSLNFQKARKSANLVISASQLGDSAMYF
25
+ Homosap TRAV16 TRAV16*01 AE000659 F AQRVTQPEKLLSVFKGAPVELKCNYSYSGSPELFWYVQYSRQRLQLLLRHISRESIKGFTADLNKGETSFHLKKPFAQEEDSAMYYCALS
26
+ Homosap TRAV17 TRAV17*01 AE000660 F SQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSGRGLVHLILIRSNEREKHSGRLRVTLDTSKKSSSLLITASRAADTASYFCATD
27
+ Homosap TRAV18 TRAV18*01 AE000660 F GDSVTQTEGPVTLPERAALTLNCTYQSSYSTFLFWYVQYLNKEPELLLKSSENQETDSRGFQASPIKSDSSFHLEKPSVQLSDSAVYYCALR
28
+ Homosap TRAV19 TRAV19*01 AE000660 F AQKVTQAQTEISVVEKEDVTLDCVYETRDTTYYLFWYKQPPSGELVFLIRRNSFDEQNEISGRYSWNFQKSTSSFNFTITASQVVDSAVYFCALSE
29
+ Homosap TRAV2 TRAV2*01 AE000658 F KDQVFQPSTVASSEGAVVEIFCNHSVSNAYNFFWYLHFPGCAPRLLVKGSKPSQQGRYNMTYERFSSSLLILQVREADAAVYYCAVE
30
+ Homosap TRAV2 TRAV2*02 M17659 (F) KDQVFQPSTVASSEGAVVEIFCNHSVSNAYNFFWHLHFPGCAPRLLVKGSKPSQQGRYNMTYERFSSSLLILQVREADAAVYYCAVAW
31
+ Homosap TRAV20 TRAV20*01 AE000660 F EDQVTQSPEALRLQEGESSSLNCSYTVSGLRGLFWYRQDPGKGPEFLFTLYSAGEEKEKERLKATLTKKESFLHITAPKPEDSATYLCAVQ
32
+ Homosap TRAV20 TRAV20*02 IMGT000024 F EDQVTQSPEALRLQEGESSSLNCSYTVSGLRGLFWYRQDPGKGPEFLFTLYSAGEEKEKERLKATLTKKESFLHITAPKPEDSATYLCAVQ
33
+ Homosap TRAV20 TRAV20*03 S60789 (F) EDQVTQSPEALRLQEGESRSLNCSYTVSGLRGLFWYRQDPGKGPEFLFTLYSAGEEKEKERLKATLTKKESFLHITAPKPEDSATYLC
34
+ Homosap TRAV20 TRAV20*04 X70305 (F) EDQVTQSPEALRLQEGESSSLNCSCTVSGLRGLFWYRQDPGKGPEFLFTLYSAGEEKEKERLKATLTKKESFLHITAPKPEDSATYLCA
35
+ Homosap TRAV21 TRAV21*01 AE000660 F KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTSLLLIQSSQREQTSGRLNASLDKSSGRSTLYIAASQPGDSATYLCAVR
36
+ Homosap TRAV21 TRAV21*02 X58736 (F) KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTSLLLIQSSQREQTSGRLNASLDKSSGRSTLYIAASQPGDSATYLCA
37
+ Homosap TRAV22 TRAV22*01 AE000660 F GIQVEQSPPDLILQEGANSTLRCNFSDSVNNLQWFHQNPWGQLINLFYIPSGTKQNGRLSATTVATERYSLLYISSSQTTDSGVYFCAVE
38
+ Homosap TRAV23/DV6 TRAV23/DV6*01 AE000660 F QQQVKQSPQSLIVQKGGISIINCAYENTAFDYFPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSLHIMDSQPGDSATYFCAAS
39
+ Homosap TRAV23/DV6 TRAV23/DV6*02 M17660 (F) QQQVKQSPQSLIVQKGGIPIINCAYENTAFDYFPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSLHIMDSQPGDSATYFCAAS
40
+ Homosap TRAV23/DV6 TRAV23/DV6*03 M97704 (F) QQQVKQSPQSLIVQKGGISIINCAYENTAFDYFPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSLHIMDSQPGDSATYFCAAS
41
+ Homosap TRAV23/DV6 TRAV23/DV6*04 Y10411 [F] QQVKQSPQSLIVQKGGISIINCAYENTAFDYFPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSLHIMDSQPGDSATYFC
42
+ Homosap TRAV23/DV6 TRAV23/DV6*05 IMGT000024 F QQQVKQSPQSLIVQKGGISIINCAYENTAFDYFPWYQQFPGKGPALLIAIRPDVSEKKEGRFTISFNKSAKQFSSHIMDSQPGDSATYFCAAS
43
+ Homosap TRAV24 TRAV24*01 AE000660 F ILNVEQSPQSLHVQEGDSTNFTCSFPSSNFYALHWYRWETAKSPEALFVMTLNGDEKKKGRISATLNTKEGYSYLYIKGSQPEDSATYLCAF
44
+ Homosap TRAV24 TRAV24*02 M17661 (F) ILNVEQGPQSLHVQEGDSTNFTCSFPSSNFYALHWYRWETAKTPEALFVMTLNGDEKKKGRISATLNTKEGYSYLYIKGSQPEDSATYLCAF
45
+ Homosap TRAV25 TRAV25*01 AE000660 F GQQVMQIPQYQHVQEGEDFTTYCNSSTTLSNIQWYKQRPGGHPVFLIQLVKSGEVKKQKRLTFQFGEAKKNSSLHITATQTTDVGTYFCAG
46
+ Homosap TRAV26-1 TRAV26-1*01 AE000660 F DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYIIHGLKNNETNEMASLIITEDRKSSTLILPHATLRDTAVYYCIVRV
47
+ Homosap TRAV26-1 TRAV26-1*02 IMGT000024 F DAKTTQPTSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYIIHGLKNNETNEMASLIITEDRKSSTLILPHATLRDTAVYYCIVRV
48
+ Homosap TRAV26-1 TRAV26-1*03 L06886 (F) DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQNIIHGLKNNETNEMASLIITEDRKSSTLILPHATLRDTAVYYCI
49
+ Homosap TRAV26-2 TRAV26-2*01 AE000660 F DAKTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVIHGLTSNVNNRMASLAIAEDRKSSTLILHRATLRDAAVYYCILRD
50
+ Homosap TRAV26-2 TRAV26-2*02 L11160 [F] DAKTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVIHGLTSNVNNRMACVAIAEDRKSST
51
+ Homosap TRAV27 TRAV27*01 AE000660 F TQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEGPVLLVTVVTGGEVKKLKRLTFQFGDARKDSSLHITAAQPGDTGLYLCAG
52
+ Homosap TRAV27 TRAV27*02 X04957 (F) TQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEGPVLLVTVVTGGEVKKLKRLTFQFGDARKDSSLHITAAQPGDTGHYLCA
53
+ Homosap TRAV27 TRAV27*03 IMGT000024 F TQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEGPVLLVTVVTGGEVKKLKRLTFQFGDARKDSSLHITAAQTGDTGLYLCAG
54
+ Homosap TRAV29/DV5 TRAV29/DV5*01 AE000660 F DQQVKQNSPSLSVQEGRISILNCDYTNSMFDYFLWYKKYPAEGPTFLISISSIKDKNEDGRFTVFLNKSAKHLSLHIVPSQPGDSAVYFCAAS
55
+ Homosap TRAV29/DV5 TRAV29/DV5*02 S81645 F DQQVKQNSPSLSVQEGRISILNCDYTNSMFDYFLWYKKYPAEGPTFLISISSIKDKNEDGRFTVFLNKSAKHLSLDIVPSQPGDSAVYFCAAS
56
+ Homosap TRAV29/DV5 TRAV29/DV5*04 IMGT000024 F DQQVKQNSPSLSVQEGRISILNCDYTNSMFDYFLWYKKYPAEGPTFLISISSIKDKNEDGRFTVFLNKSAKHLSLHIVPSQPGDSAVYFCAAS
57
+ Homosap TRAV3 TRAV3*01 AE000658 F AQSVAQPEDQVNVAEGNPLTVKCTYSVSGNPYLFWYVQYPNRGLQFLLKYITGDNLVKGSYGFEAEFNKSQTSFHLKKPSALVSDSALYFCAVRD
58
+ Homosap TRAV30 TRAV30*01 AE000660 F QQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHGEAPVFLMILLKGGEQKGHEKISASFNEKKQQSSLYLTASQLSYSGTYFCGTE
59
+ Homosap TRAV30 TRAV30*02 X58768 (F) QQPVQSPQAVILREGEDAVTNCSSSKALYSVHWYRQKHGEAPVFLMILLKGGEQMRREKISASFNEKKQQSSLYLTASQLSYSGTYFCG
60
+ Homosap TRAV30 TRAV30*03 L06883 (F) QQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHGEAPVFLMILLKGGEQKGHEKISASFNEKKRQSSLYLTASQLSYSGTYFCG
61
+ Homosap TRAV30 TRAV30*04 U32537 [F] QQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHGEAPVFLMILLKGGEQKRHEKISASFNEKKQQSSLYLT
62
+ Homosap TRAV30 TRAV30*05 IMGT000024 F QQPVQSPQAVILREGEDAVINCSSSKALYSVHWYRQKHGEAPVFLMILLKGGEQKGHDKISASFNEKKQQSSLYLTASQLSYSGTYFCGTE
63
+ Homosap TRAV34 TRAV34*01 AE000660 F SQELEQSPQSLIVQEGKNLTINCTSSKTLYGLYWYKQKYGEGLIFLMMLQKGGEEKSHEKITAKLDEKKQQSSLHITASQPSHAGIYLCGAD
64
+ Homosap TRAV35 TRAV35*01 AE000660 F GQQLNQSPQSMFIQEGEDVSMNCTSSSIFNTWLWYKQEPGEGPVLLIALYKAGELTSNGRLTAQFGITRKDSFLNISASIPSDVGIYFCAGQ
65
+ Homosap TRAV35 TRAV35*02 X58738 (F) GQQLNQSPQSMFIQEGEDVSMNCTSSSIFNTWLWYKQDPGEGPVLLIALYKAGELTSNGRLTAQFGITRKDSFLNISASIPSDVGIYFCA
66
+ Homosap TRAV36/DV7 TRAV36/DV7*01 AE000660 F EDKVVQSPLSLVVHEGDTVTLNCSYEVTNFRSLLWYKQEKKAPTFLFMLTSSGIEKKSGRLSSILDKKELSSILNITATQTGDSAIYLCAVE
67
+ Homosap TRAV36/DV7 TRAV36/DV7*02 X61070 (F) EDKVVQSPQSLVVHEGDTVTLNCSYEMTNFRSLQWYKQEKKAPTFLFMLTSSGIEKKSGRLSSILDKKELFSILNITATQTGDSAVYLCAV
68
+ Homosap TRAV36/DV7 TRAV36/DV7*03 X58767 (F) EDKVVQSPLSLVVHEGDTVTPNCSYEVTNFRSLLWYKQEKKAPTFLFMLTSSGIEKKSGRLSSILDKKELFSILNITATQTGDSAVYLCA
69
+ Homosap TRAV36/DV7 TRAV36/DV7*04 Z46643 (F) EDKVVQSPLSLVVHEGDTVTLNCSYEVTNFRSLLWYKQEKKAPTFLFMLTSSGIEKKSGRLSSILDKKELFSILNITATQTGDSAVYLCA
70
+ Homosap TRAV36/DV7 TRAV36/DV7*05 IMGT000024 F EDKVVQSPLSLVVHEGDTVTLNCSYEVTNFRSLLWYKQEKKAPTFLFMLTSSGIEKKSGRLSSILDKKELFSILNITATQTGDSAIYLCAVE
71
+ Homosap TRAV38-1 TRAV38-1*01 AE000661 F AQTVTQSQPEMSVQEAETVTLSCTYDTSENNYYLFWYKQPPSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDTAMYFCAFMK
72
+ Homosap TRAV38-1 TRAV38-1*02 M64355 (F) AQTVTQSQPEMSVQEAETVTLSCTYDTSENDYYLFWYKQPPSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDTAMYFCA
73
+ Homosap TRAV38-1 TRAV38-1*03 M95394 (F) AQTVTQSQPEMSVQEAETVTLSCTYDTSESNYYLFWYKQPPSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDTAMYFCAF
74
+ Homosap TRAV38-1 TRAV38-1*04 L06880 (F) AQTVTQSQPEMSVQEAETVTLSCTYDTSENNYYLFWYKQPPSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDTAMYFCA
75
+ Homosap TRAV38-2/DV8 TRAV38-2/DV8*01 AE000661 F AQTVTQSQPEMSVQEAETVTLSCTYDTSESDYYLFWYKQPPSRQMILVIRQEAYKQQNATENRFSVNFQKAAKSFSLKISDSQLGDAAMYFCAYRS
76
+ Homosap TRAV39 TRAV39*01 AE000661 F ELKVEQNPLFLSMQEGKNYTIYCNYSTTSDRLYWYRQDPGKSLESLFVLLSNGAVKQEGRLMASLDTKARLSTLHITAAVHDLSATYFCAVD
77
+ Homosap TRAV4 TRAV4*01 AE000658 F LAKTTQPISMDSYEGQEVNITCSHNNIATNDYITWYQQFPSQGPRFIIQGYKTKVTNEVASLFIPADRKSSTLSLPRVSLSDTAVYYCLVGD
78
+ Homosap TRAV40 TRAV40*01 X73521 F SNSVKQTGQITVSEGASVTMNCTYTSTGYPTLFWYVEYPSKPLQLLQRETMENSKNFGGGNIKDKNSPIVKYSVQVSDSAVYYCLLG
79
+ Homosap TRAV41 TRAV41*01 AE000661 F KNEVEQSPQNLTAQEGEFITINCSYSVGISALHWLQQHPGGGIVSLFMLSSGKKKHGRLIATINIQEKHSSLHITASHPRDSAVYICAVR
80
+ Homosap TRAV5 TRAV5*01 AE000659 F GEDVEQSLFLSVREGDSSVINCTYTDSSSTYLYWYKQEPGAGLQLLTYIFSNMDMKQDQRLTVLLNKKDKHLSLRIADTQTGDSAIYFCAES
81
+ Homosap TRAV6 TRAV6*01 AE000659 F SQKIEQNSEALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQPADSATYLCALD
82
+ Homosap TRAV6 TRAV6*02 X58747 (F) SQKIEQNSEALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQPADSATYLCA
83
+ Homosap TRAV6 TRAV6*03 Z49060 [F] EALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQPADSATYLCA
84
+ Homosap TRAV6 TRAV6*04 Y10409 [F] EALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHVTASQPADSATYLCA
85
+ Homosap TRAV6 TRAV6*05 Y10410 [F] EALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQPADSATYLCA
86
+ Homosap TRAV6 TRAV6*06 U32542 [F] SQKIEQNSEALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLNQ
87
+ Homosap TRAV6 TRAV6*07 IMGT000024 F SQKIEQNSEALNIQEGKTATLTCNYTNYSPAYLQWYRQDPGRGPVFLLLIRENEKEKRKERLKVTFDTTLKQSLFHITASQPADSATYLCALD
88
+ Homosap TRAV7 TRAV7*01 AE000659 F ENQVEHSPHFLGPQQGDVASMSCTYSVSRFNNLQWYRQNTGMGPKHLLSMYSAGYEKQKGRLNATLLKNGSSLYITAVQPEDSATYFCAVD
89
+ Homosap TRAV8-1 TRAV8-1*01 AE000659 F AQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPGQHLQLLLKYFSGDPLVKGIKGFEAEFIKSKFSFNLRKPSVQWSDTAEYFCAVN
90
+ Homosap TRAV8-1 TRAV8-1*02 U32520 [F] AQSVSQHNHHVILSEAASLELGCNYSYGGTVNLFWYVQYPGQHLQLLLKYFSGDPLVKGIKGVEAEFIKSKFSFNLRKPSVQW
91
+ Homosap TRAV8-2 TRAV8-2*01 AE000659 F AQSVTQLDSHVSVSEGTPVLLRCNYSSSYSPSLFWYVQHPNKGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCVVS
92
+ Homosap TRAV8-2 TRAV8-2*02 M17650 (F) AQSVTQLSSHVSVSEGTPVLLRCNYSSSYSPSLFWYVQHPNKGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCVV
93
+ Homosap TRAV8-2 TRAV8-2*03 IMGT000024 F AQSVTQLDSHVSVSEGTPVLLRCNYSSSYSPSLFWYVQHPNKGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCVVS
94
+ Homosap TRAV8-3 TRAV8-3*01 AE000659 F AQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPGQGLQLLLKYFSGDTLVQGIKGFEAEFKRSQSSFNLRKPSVHWSDAAEYFCAVG
95
+ Homosap TRAV8-3 TRAV8-3*02 M35617 (F) AQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPGQGLQLLLKYFSGDTLVQGIKGFEAEFKRSQSSFNLRKPSVHWSDAAEYFCAVV
96
+ Homosap TRAV8-3 TRAV8-3*03 L06885 (F) AQSVTQPDIHITVSEGASLELRCNYSYGATPYLFWYVQSPGQGLQLLLKYFSGDTLVQGIKGFEAEFKRSQSSFNLRKPSVHWSDASEYFCA
97
+ Homosap TRAV8-4 TRAV8-4*01 AE000659 F AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVS
98
+ Homosap TRAV8-4 TRAV8-4*02 M12423 (F) AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVS
99
+ Homosap TRAV8-4 TRAV8-4*03 D13077 (F) AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTTGATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCA
100
+ Homosap TRAV8-4 TRAV8-4*04 M12959 (F) AQSVTQLGSHVSVSERALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVS
101
+ Homosap TRAV8-4 TRAV8-4*05 X63455 (F) AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVS
102
+ Homosap TRAV8-4 TRAV8-4*06 K02777 (F) GATHYCCPPILFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPAAHMSDAAEYFCAVS
103
+ Homosap TRAV8-4 TRAV8-4*07 M17665 (F) VEPYLFWYVQYPNQGLQLLLKYTTGATLVKGINGFEAEFKKSETSFHLTKPSAHMTDPAEYFCAV
104
+ Homosap TRAV8-6 TRAV8-6*01 X02850 F AQSVTQLDSQVPVFEEAPVELRCNYSSSVSVYLFWYVQYPNQGLQLLLKYLSGSTLVESINGFEAEFNKSQTSFHLRKPSVHISDTAEYFCAVS
105
+ Homosap TRAV8-6 TRAV8-6*02 AE000659 F AQSVTQLDSQVPVFEEAPVELRCNYSSSVSVYLFWYVQYPNQGLQLLLKYLSGSTLVKGINGFEAEFNKSQTSFHLRKPSVHISDTAEYFCAVS
106
+ Homosap TRAV9-1 TRAV9-1*01 AE000659 F GDSVVQTEGQVLPSEGDSLIVNCSYETTQYPSLFWYVQYPGEGPQLHLKAMKANDKGRNKGFEAMYRKETTSFHLEKDSVQESDSAVYFCALS
107
+ Homosap TRAV9-2 TRAV9-2*01 AE000659 F GNSVTQMEGPVTLSEEAFLTINCTYTATGYPSLFWYVQYPGEGLQLLLKATKADDKGSNKGFEATYRKETTSFHLEKGSVQVSDSAVYFCALS
108
+ Homosap TRAV9-2 TRAV9-2*02 IMGT000024 F GDSVTQMEGPVTLSEEAFLTINCTYTATGYPSLFWYVQYPGEGLQLLLKATKADDKGSNKGFEATYRKETTSFHLEKGSVQVSDSAVYFCALS
109
+ Homosap TRAV9-2 TRAV9-2*03 L06881 (F) GDSVTQMEGPVTLSEEAFLTINCTYTATGYPSLFWYVQYPGEGLQLLLKATKADDKGSNKGFEATYRKETTSFHLEKGSVQVSDSAVYFCA
110
+ Homosap TRAV9-2 TRAV9-2*04 L06882 (F) GNSVTQMEGPVTLSEEAFLTINCTYTATGYPSLFWYVQYPGEGLQLLLKATKADDKGSNKGFEATYRKETTSFHLEKGSVQVSDSAVYFCA
src/library/travs_nt.tsv ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Species Gene Allele AccNum Functionality nt_seq
2
+ Homosap TRAV1-1 TRAV1-1*01 AE000658 F ggacaaagccttgagcagccctctgaagtgacagctgtggaaggagccattgtccagataaactgcacgtaccagacatctgggttttatgggctgtcctggtaccagcaacatgatggcggagcacccacatttctttcttacaatgctctggatggtttggaggagacaggtcgtttttcttcattccttagtcgctctgatagttatggttacctccttctacaggagctccagatgaaagactctgcctcttacttctgcgctgtgagaga
3
+ Homosap TRAV1-1 TRAV1-1*02 X04939 (F) ggacaaagccttgagcagccctctgaagtgacagctgtggaaggagccattgtccagataaactgcacgtaccagacatctgggttttatgggctgtcctggtaccagcaacatgatggcggagcacccacatttctttcttacaatggtctggatggtttggaggagacaggtcgtttttcttcattccttagtcgctctgatagttatggttacctccttctacaggagctccagatgaaagactctgcctcttacttctgcgctgt
4
+ Homosap TRAV1-2 TRAV1-2*01 AE000658 F ggacaaaacattgaccagcccactgagatgacagctacggaaggtgccattgtccagatcaactgcacgtaccagacatctgggttcaacgggctgttctggtaccagcaacatgctggcgaagcacccacatttctgtcttacaatgttctggatggtttggaggagaaaggtcgtttttcttcattccttagtcggtctaaagggtacagttacctccttttgaaggagctccagatgaaagactctgcctcttacctctgtgctgtgagaga
5
+ Homosap TRAV1-2 TRAV1-2*02 U32544 [F] ggacaaaacattgaccagcccactgagatgacagctacggaaggtgccattgtccagatcaactgcacgtaccagacatctgggttcaacgggctgttctggtaccagcaacatgctggcgaagcacccacatttctgtcttacaatgttctggatggtctggaggagaaaggtcg
6
+ Homosap TRAV1-2 TRAV1-2*03 IMGT000024 F ggacaaaacattgaccagcccactgagatgacagctacggaaggtgccattgtccagatcaactgcacgtaccagacatctgggttcaacgggctgttctggtaccagcaacatgctggcgaagcacctacatttctgtcttacaatgttctggatggtttggaggagaaaggtcgtttttcttcattccttagtcggtctaaagggtacagttacctccttttgaaggagctccagatgaaagactctgcctcttacctctgtgctgtgagaga
7
+ Homosap TRAV10 TRAV10*01 AE000659 F aaaaaccaagtggagcagagtcctcagtccctgatcatcctggagggaaagaactgcactcttcaatgcaattatacagtgagccccttcagcaacttaaggtggtataagcaagatactgggagaggtcctgtttccctgacaatcatgactttcagtgagaacacaaagtcgaacggaagatatacagcaactctggatgcagacacaaagcaaagctctctgcacatcacagcctcccagctcagcgattcagcctcctacatctgtgtggtgagcg
8
+ Homosap TRAV10 TRAV10*02 IMGT000024 F aaaaaccaagtggagcagagtcctcagtccctgatcatcctggagggaaagaactgcactcttcaatgcaattatacagtgagccccttcagcaacttaaggtggtataagcaagatacggggagaggtcctgtttccctgacaatcatgactttcagtgagaacacaaagtcgaacggaagatatacagcaactctggatgcagacacaaagcaaagctctctgcacatcacagcctcccagctcagcgattcagcctcctacatctgtgtggtgagcg
9
+ Homosap TRAV12-1 TRAV12-1*01 AE000659 F cggaaggaggtggagcaggatcctggacccttcaatgttccagagggagccactgtcgctttcaactgtacttacagcaacagtgcttctcagtctttcttctggtacagacaggattgcaggaaagaacctaagttgctgatgtccgtatactccagtggtaatgaagatggaaggtttacagcacagctcaatagagccagccagtatatttccctgctcatcagagactccaagctcagtgattcagccacctacctctgtgtggtgaaca
10
+ Homosap TRAV12-1 TRAV12-1*02 M17657 (F) cggaaggaggtggagcaggatcctggacccttcaatgttccagagggagccactgtcgctttcaactgtacttacagcaacagtgcttctcagtctttcttctggtacagacaggattgcaggaaagaacctaagttgctgatgtccgtatactccagtggtaatgaagatggaaggtttacagcacacgtcaatagagccagccagtatatttccctgctcatcagagactccaagctcagtgattcagccacctacctctgtgtggtgaaca
11
+ Homosap TRAV12-2 TRAV12-2*01 AE000659 F cagaaggaggtggagcagaattctggacccctcagtgttccagagggagccattgcctctctcaactgcacttacagtgaccgaggttcccagtccttcttctggtacagacaatattctgggaaaagccctgagttgataatgttcatatactccaatggtgacaaagaagatggaaggtttacagcacagctcaataaagccagccagtatgtttctctgctcatcagagactcccagcccagtgattcagccacctacctctgtgccgtgaaca
12
+ Homosap TRAV12-2 TRAV12-2*02 M81774 (F) cagaaggaggtggagcagaattctggacccctcagtgttccagagggagccattgcctctctcaactgcacttacagtgaccgaggttcccagtccttcttctggtacagacaatattctgggaaaagccctgagttgataatgtccatatactccaatggtgacaaagaagatggaaggtttacagcacagctcaataaagccagccagtatgtttctctgctcatcagagactcccagcccagtgattcagccacctacctctgtgccgtg
13
+ Homosap TRAV12-2 TRAV12-2*03 X04946 (F) ggacccctcagtgttccagagggagccattgcctctctcaactgcacttacagtgaccgagtttcccagtccttcttctggtacagacaatattctgggaaaagccctgagttgataatgtccatatactccaatggtgacaaagaagatggaaggtttacagcacagctcaataaagccagccagtatgtttctctgctcatcagagactcccagcccagtgattcagccacctacctctgtgccgtgaac
14
+ Homosap TRAV12-3 TRAV12-3*01 AE000659 F cagaaggaggtggagcaggatcctggaccactcagtgttccagagggagccattgtttctctcaactgcacttacagcaacagtgcttttcaatacttcatgtggtacagacagtattccagaaaaggccctgagttgctgatgtacacatactccagtggtaacaaagaagatggaaggtttacagcacaggtcgataaatccagcaagtatatctccttgttcatcagagactcacagcccagtgattcagccacctacctctgtgcaatgagcg
15
+ Homosap TRAV12-3 TRAV12-3*02 M17656 (F) cagaaggaggtggagcaggatcctggaccactcagtgttccagagggagccattgtttctctcaactgcacttacagcaacagtgcttttcaatacttcatgtggtacagacagtattccagaataggccctgagttgctgatgtacacatactccagtggtaacaaagaagatggaaggtttacagcacaggtcgataaatccagcaagtatatctccttgttcatcagagactcacagcccagtgattcagccacctacctctgtgcaatgagcg
16
+ Homosap TRAV13-1 TRAV13-1*01 AE000659 F ggagagaatgtggagcagcatccttcaaccctgagtgtccaggagggagacagcgctgttatcaagtgtacttattcagacagtgcctcaaactacttcccttggtataagcaagaacttggaaaaggacctcagcttattatagacattcgttcaaatgtgggcgaaaagaaagaccaacgaattgctgttacattgaacaagacagccaaacatttctccctgcacatcacagagacccaacctgaagactcggctgtctacttctgtgcagcaagta
17
+ Homosap TRAV13-1 TRAV13-1*02 X04954 (F) ggagagaatgtggagcagcatccttcaaccctgagtgtccaggagggagacagcgctgttatcaagtgtacttattcagacagtgcctcaaactacttcccttggtataagcaagaacttggaaaaagacctcagcttattatagacattcgttcaaatgtgggcgaaaagaaagaccaacgaattgctgttacattgaacaagacagccaaacatttctccctgcacatcacagagacccaacctgaagactcggctgtctacttctgtgcagcaagta
18
+ Homosap TRAV13-1 TRAV13-1*03 L11162 [F] ggagagaatgtggagcagcatccttcaaccctgagtgtccaggagggagacagcgctgttatcaagtgtacttattcagacagtgcctcaaactacttcccttggtataagcaagaacttggaaaaagacctcagcttattatagacattcgttcaaatgtgggcgaaaagaaagaccaacgaattgctgttacattgaacaagacagccaaacatttctccctgcagatcaca
19
+ Homosap TRAV13-2 TRAV13-2*01 AE000659 F ggagagagtgtggggctgcatcttcctaccctgagtgtccaggagggtgacaactctattatcaactgtgcttattcaaacagcgcctcagactacttcatttggtacaagcaagaatctggaaaaggtcctcaattcattatagacattcgttcaaatatggacaaaaggcaaggccaaagagtcaccgttttattgaataagacagtgaaacatctctctctgcaaattgcagctactcaacctggagactcagctgtctacttttgtgcagagaata
20
+ Homosap TRAV13-2 TRAV13-2*02 M17658 (F) ggagagagtgtggggctgcatcttcctaccctgagtgtccaggagggtgacaactctattatcaactgtgcttattcaaacagcgcctcagactacttcatttggtacaaacaagaatctggaaaaggtcctcaattcattatagacattcgttcaaatatggacaaaaggcaaggccaaagagtcaccgttttattgaataagacagtgaaacatctctctctgcaaattgcagctactcaacctggagactcagctgtctacttttgtgcagaga
21
+ Homosap TRAV14/DV4 TRAV14/DV4*01 M21626 F gcccagaagataactcaaacccaaccaggaatgttcgtgcaggaaaaggaggctgtgactctggactgcacatatgacaccagtgatccaagttatggtctattctggtacaagcagcccagcagtggggaaatgatttttcttatttatcaggggtcttatgaccagcaaaatgcaacagaaggtcgctactcattgaatttccagaaggcaagaaaatccgccaaccttgtcatctccgcttcacaactgggggactcagcaatgtacttctgtgcaatgagagaggg
22
+ Homosap TRAV14/DV4 TRAV14/DV4*02 AE000659 F gcccagaagataactcaaacccaaccaggaatgttcgtgcaggaaaaggaggctgtgactctggactgcacatatgacaccagtgatcaaagttatggtctattctggtacaagcagcccagcagtggggaaatgatttttcttatttatcaggggtcttatgacgagcaaaatgcaacagaaggtcgctactcattgaatttccagaaggcaagaaaatccgccaaccttgtcatctccgcttcacaactgggggactcagcaatgtatttctgtgcaatgagagaggg
23
+ Homosap TRAV14/DV4 TRAV14/DV4*03 M21624 (F) gcccagaagataactcaaacccaaccaggaatgttcgtgcaggaaaaggaggctgtgactctggactgcacatatgacaccagtgatccaagttatggtctattctggtacaagcagcccagcagtggggaaatgatttttcttatttatcaggggtcttatgaccagcaaaatgcaacagaaggtcgctactcattgaatttccagaaggcaagaaaatccgccaaccttgtcatctccgcttcacaactgggggactcagcaatgtatttctgtgcaatg
24
+ Homosap TRAV14/DV4 TRAV14/DV4*04 L09758 [F] cagaagataactcaaacccaaccaggaatgttcgtgcaggaaaaggaggctgtgactctggactgcacatatgacaccagtgatcaaagttatggtctcttctggtacaagcagcccagcagtggggaaatgatttttcttatttatcaggggtcttatgacgagcaaaatgcaacagaaggtcgctactcattgaatttccagaaggcaagaaaatccgccaaccttgtcatctccgcttcacaactgggggactcagcaatgtacttct
25
+ Homosap TRAV16 TRAV16*01 AE000659 F gcccagagagtgactcagcccgagaagctcctctctgtctttaaaggggccccagtggagctgaagtgcaactattcctattctgggagtcctgaactcttctggtatgtccagtactccagacaacgcctccagttactcttgagacacatctctagagagagcatcaaaggcttcactgctgaccttaacaaaggcgagacatctttccacctgaagaaaccatttgctcaagaggaagactcagccatgtattactgtgctctaagtgg
26
+ Homosap TRAV17 TRAV17*01 AE000660 F agtcaacagggagaagaggatcctcaggccttgagcatccaggagggtgaaaatgccaccatgaactgcagttacaaaactagtataaacaatttacagtggtatagacaaaattcaggtagaggccttgtccacctaattttaatacgttcaaatgaaagagagaaacacagtggaagattaagagtcacgcttgacacttccaagaaaagcagttccttgttgatcacggcttcccgggcagcagacactgcttcttacttctgtgctacggacg
27
+ Homosap TRAV18 TRAV18*01 AE000660 F ggagactcggttacccagacagaaggcccagttaccctccctgagagggcagctctgacattaaactgcacttatcagtccagctattcaacttttctattctggtatgtccagtatctaaacaaagagcctgagctcctcctgaaaagttcagaaaaccaggagacggacagcagaggttttcaggccagtcctatcaagagtgacagttccttccacctggagaagccctcggtgcagctgtcggactctgccgtgtactactgcgctctgagaga
28
+ Homosap TRAV19 TRAV19*01 AE000660 F gctcagaaggtaactcaagcgcagactgaaatttctgtggtggagaaggaggatgtgaccttggactgtgtgtatgaaacccgtgatactacttattacttattctggtacaagcaaccaccaagtggagaattggttttccttattcgtcggaactcttttgatgagcaaaatgaaataagtggtcggtattcttggaacttccagaaatccaccagttccttcaacttcaccatcacagcctcacaagtcgtggactcagcagtatacttctgtgctctgagtgaggc
29
+ Homosap TRAV2 TRAV2*01 AE000658 F aaggaccaagtgtttcagccttccacagtggcatcttcagagggagctgtggtggaaatcttctgtaatcactctgtgtccaatgcttacaacttcttctggtaccttcacttcccgggatgtgcaccaagactccttgttaaaggctcaaagccttctcagcagggacgatacaacatgacctatgaacggttctcttcatcgctgctcatcctccaggtgcgggaggcagatgctgctgtttactactgtgctgtggagga
30
+ Homosap TRAV2 TRAV2*02 M17659 (F) aaggaccaagtgtttcagccttccacagtggcatcttcagagggagctgtggtggaaatcttctgtaatcactctgtgtccaatgcttacaacttcttctggcaccttcacttcccgggatgtgcaccaagactccttgttaaaggctcaaagccttctcagcagggacgatacaacatgacctatgaacggttctcttcatcgctgctcatcctccaggtgcgggaggcagatgctgctgtttactactgtgctgtggcctgg
31
+ Homosap TRAV20 TRAV20*01 AE000660 F gaagaccaggtgacgcagagtcccgaggccctgagactccaggagggagagagtagcagtcttaactgcagttacacagtcagcggtttaagagggctgttctggtataggcaagatcctgggaaaggccctgaattcctcttcaccctgtattcagctggggaagaaaaggagaaagaaaggctaaaagccacattaacaaagaaggaaagctttctgcacatcacagcccctaaacctgaagactcagccacttatctctgtgctgtgcagg
32
+ Homosap TRAV20 TRAV20*02 IMGT000024 F gaagaccaggtgacgcagagtcccgaggccctgagactccaggagggagagagtagcagtctcaactgcagttacacagtcagcggtttaagagggctgttctggtataggcaagatcctgggaaaggccctgaattcctcttcaccctgtattcagctggggaagaaaaggagaaagaaaggctaaaagccacattaacaaagaaggaaagctttctgcacatcacagcccctaaacctgaagactcagccacttatctctgtgctgtgcagg
33
+ Homosap TRAV20 TRAV20*03 S60789 (F) gaagaccaggtgacgcagagtcccgaggccctgagactccaggagggagagagtcgcagtctcaactgcagttacacagtcagcggtttaagagggctgttctggtataggcaagatcctgggaaaggccctgaattcctcttcaccctgtattcagctggggaagaaaaggagaaagaaaggctaaaagccacattaacaaagaaggaaagctttctgcacatcacagcccctaaacctgaagactcagccacttatctctgt
34
+ Homosap TRAV20 TRAV20*04 X70305 (F) gaagaccaggtgacgcagagtcccgaggccctgagactccaggagggagagagtagcagtctcaactgcagttgcacagtcagcggtttaagagggctgttctggtataggcaagatcctgggaaaggccctgaattcctcttcaccctgtattcagctggggaagaaaaggagaaagaaaggctaaaagccacattaacaaagaaggaaagctttctgcacatcacagcccctaaacctgaagactcagccacttatctctgtgct
35
+ Homosap TRAV21 TRAV21*01 AE000660 F aaacaggaggtgacgcagattcctgcagctctgagtgtcccagaaggagaaaacttggttctcaactgcagtttcactgatagcgctatttacaacctccagtggtttaggcaggaccctgggaaaggtctcacatctctgttgcttattcagtcaagtcagagagagcaaacaagtggaagacttaatgcctcgctggataaatcatcaggacgtagtactttatacattgcagcttctcagcctggtgactcagccacctacctctgtgctgtgagg
36
+ Homosap TRAV21 TRAV21*02 X58736 (F) aaacaggaggtgacacagattcctgcagctctgagtgtcccagaaggagaaaacttggttctcaactgcagtttcactgatagcgctatttacaacctccagtggtttaggcaggaccctgggaaaggtctcacatctctgttgcttattcagtcaagtcagagagagcaaacaagtggaagacttaatgcctcgctggataaatcatcaggacgtagtactttatacattgcagcttctcagcctggtgactcagccacctacctctgtgct
37
+ Homosap TRAV22 TRAV22*01 AE000660 F ggaatacaagtggagcagagtcctccagacctgattctccaggagggagccaattccacgctgcggtgcaatttttctgactctgtgaacaatttgcagtggtttcatcaaaacccttggggacagctcatcaacctgttttacattccctcagggacaaaacagaatggaagattaagcgccacgactgtcgctacggaacgctacagcttattgtacatttcctcttcccagaccacagactcaggcgtttatttctgtgctgtggagc
38
+ Homosap TRAV23/DV6 TRAV23/DV6*01 AE000660 F cagcagcaggtgaaacaaagtcctcaatctttgatagtccagaaaggagggatttcaattataaactgtgcttatgagaacactgcgtttgactactttccatggtaccaacaattccctgggaaaggccctgcattattgatagccatacgtccagatgtgagtgaaaagaaagaaggaagattcacaatctccttcaataaaagtgccaagcagttctcattgcatatcatggattcccagcctggagactcagccacctacttctgtgcagcaagca
39
+ Homosap TRAV23/DV6 TRAV23/DV6*02 M17660 (F) cagcagcaggtgaaacaaagtcctcaatctttgatagtccagaaaggagggattccaattataaactgtgcttatgagaacactgcgtttgactactttccatggtaccaacaattccctgggaaaggccctgcattattgatagccatacgtccagatgtgagtgaaaagaaagaaggaagattcacaatctccttcaataaaagtgccaagcagttctcattgcatatcatggattcccagcctggagactcagccacctacttctgtgcagcaagcg
40
+ Homosap TRAV23/DV6 TRAV23/DV6*03 M97704 (F) cagcagcaggtgaaacaaagtcctcaatctttgatagtccagaaaggagggatttcaattataaactgtgcttatgagaacactgcgtttgactactttccatggtaccaacagttccctgggaaaggccctgcattattgatagccatacgtccagatgtgagtgaaaagaaagaaggaagattcacaatctccttcaataaaagtgccaagcagttctcattgcatatcatggattcccagcctggagactcagccacctacttctgtgcagcaagca
41
+ Homosap TRAV23/DV6 TRAV23/DV6*04 Y10411 [F] cagcaggtgaaacaaagtcctcaatctttgatagtccagaaaggagggatttcaattataaactgtgcttatgagaacactgcgtttgactactttccatggtaccagcaattccctgggaaaggccctgcattattgatagccatacgtccagatgtgagtgaaaagaaagaaggaagattcacaatctccttcaataaaagtgccaagcagttctcattgcatatcatggattcccagcctggagactcagccacctacttctgt
42
+ Homosap TRAV23/DV6 TRAV23/DV6*05 IMGT000024 F cagcagcaggtgaaacaaagtcctcaatctttgatagtccagaaaggagggatttcaattataaactgtgcttatgagaacactgcgtttgactactttccatggtaccaacaattccctgggaaaggccctgcattattgatagccatacgtccagatgtgagtgaaaagaaagaaggaagattcacaatctccttcaataaaagtgccaagcagttctcatcgcatatcatggattcccagcctggagactcagccacctacttctgtgcagcaagca
43
+ Homosap TRAV24 TRAV24*01 AE000660 F atactgaacgtggaacaaagtcctcagtcactgcatgttcaggagggagacagcaccaatttcacctgcagcttcccttccagcaatttttatgccttacactggtacagatgggaaactgcaaaaagccccgaggccttgtttgtaatgactttaaatggggatgaaaagaagaaaggacgaataagtgccactcttaataccaaggagggttacagctatttgtacatcaaaggatcccagcctgaagactcagccacatacctctgtgccttta
44
+ Homosap TRAV24 TRAV24*02 M17661 (F) atactgaacgtggaacaaggtcctcagtcactgcatgttcaggagggagacagcaccaatttcacctgcagcttcccttccagcaatttttatgccttacactggtacagatgggaaactgccaaaacacccgaggccttgtttgtaatgactttaaatggggatgaaaagaagaaaggacgaataagtgccactcttaataccaaggagggttacagctatttgtacatcaaaggatcccagcctgaagattcagccacatacctctgtgccttta
45
+ Homosap TRAV25 TRAV25*01 AE000660 F ggacaacaggtaatgcaaattcctcagtaccagcatgtacaagaaggagaggacttcaccacgtactgcaattcctcaactactttaagcaatatacagtggtataagcaaaggcctggtggacatcccgtttttttgatacagttagtgaagagtggagaagtgaagaagcagaaaagactgacatttcagtttggagaagcaaaaaagaacagctccctgcacatcacagccacccagactacagatgtaggaacctacttctgtgcaggg
46
+ Homosap TRAV26-1 TRAV26-1*01 AE000660 F gatgctaagaccacccagcccccctccatggattgcgctgaaggaagagctgcaaacctgccttgtaatcactctaccatcagtggaaatgagtatgtgtattggtatcgacagattcactcccaggggccacagtatatcattcatggtctaaaaaacaatgaaaccaatgaaatggcctctctgatcatcacagaagacagaaagtccagcaccttgatcctgccccacgctacgctgagagacactgctgtgtactattgcatcgtcagagtcg
47
+ Homosap TRAV26-1 TRAV26-1*02 IMGT000024 F gatgctaagaccacccagcccacctccatggattgcgctgaaggaagagctgcaaacctgccttgtaatcactctaccatcagtggaaatgagtatgtgtattggtatcgacagattcactcccaggggccacagtatatcattcatggtctaaaaaacaatgaaaccaatgaaatggcctctctgatcatcacagaagacagaaagtccagcaccttgatcctgccccacgctacgctgagagacactgctgtgtactattgcatcgtcagagtcg
48
+ Homosap TRAV26-1 TRAV26-1*03 L06886 (F) gatgctaagaccacccagcccccctccatggattgcgctgaaggaagagctgcaaacctgccttgtaatcactctaccatcagtggaaatgagtatgtgtattggtatcgacagattcactcccaggggccacagaatatcattcatggtctaaaaaacaatgaaaccaatgaaatggcctctctgatcatcacagaagacagaaagtccagcaccttgatcctgccccacgctacgctgagagacactgctgtgtactattgcatc
49
+ Homosap TRAV26-2 TRAV26-2*01 AE000660 F gatgctaagaccacacagccaaattcaatggagagtaacgaagaagagcctgttcacttgccttgtaaccactccacaatcagtggaactgattacatacattggtatcgacagcttccctcccagggtccagagtacgtgattcatggtcttacaagcaatgtgaacaacagaatggcctctctggcaatcgctgaagacagaaagtccagtaccttgatcctgcaccgtgctaccttgagagatgctgctgtgtactactgcatcctgagagac
50
+ Homosap TRAV26-2 TRAV26-2*02 L11160 [F] gatgctaagaccacacagccaaattcaatggagagtaacgaagaagagcctgttcacttgccttgtaaccactccacaatcagtggaactgattacatacattggtatcgacagcttccctcccagggtccagagtacgtgattcatggtcttacaagcaatgtgaacaacagaatggcctgtgtggcaatcgctgaagacagaaagtccagtacct
51
+ Homosap TRAV27 TRAV27*01 AE000660 F acccagctgctggagcagagccctcagtttctaagcatccaagagggagaaaatctcactgtgtactgcaactcctcaagtgttttttccagcttacaatggtacagacaggagcctggggaaggtcctgtcctcctggtgacagtagttacgggtggagaagtgaagaagctgaagagactaacctttcagtttggtgatgcaagaaaggacagttctctccacatcactgcagcccagcctggtgatacaggcctctacctctgtgcaggag
52
+ Homosap TRAV27 TRAV27*02 X04957 (F) acccagctgctggagcagagccctcagtttctaagcatccaagagggagaaaatctcactgtgtactgcaactcctcaagtgttttttccagcttacaatggtacaggcaggagcctggggaaggtcctgtcctcctggtgacagtagttacgggtggagaagtgaagaagctgaagagactaacctttcagtttggtgatgcaagaaaggacagttctctccacatcactgcggcccagcctggtgatacaggccactacctctgtgcagg
53
+ Homosap TRAV27 TRAV27*03 IMGT000024 F acccagctgctggagcagagccctcagtttctaagcatccaagagggagaaaatctcactgtgtactgcaactcctcaagtgttttttccagcttacaatggtacagacaggagcctggggaaggtcctgtcctcctggtgacagtagttacgggtggagaagtgaagaagctgaagagactaacctttcagtttggtgatgcaagaaaggacagttctctccacatcactgcagcccagactggtgatacaggcctctacctctgtgcaggag
54
+ Homosap TRAV29/DV5 TRAV29/DV5*01 AE000660 F gaccagcaagttaagcaaaattcaccatccctgagcgtccaggaaggaagaatttctattctgaactgtgactatactaacagcatgtttgattatttcctatggtacaaaaaataccctgctgaaggtcctacattcctgatatctataagttccattaaggataaaaatgaagatggaagattcactgtcttcttaaacaaaagtgccaagcacctctctctgcacattgtgccctcccagcctggagactctgcagtgtacttctgtgcagcaagcg
55
+ Homosap TRAV29/DV5 TRAV29/DV5*02 S81645 F gaccagcaagttaagcaaaattcaccatccctgagcgtccaggaaggaagaatttctattctgaactgtgactatactaacagcatgtttgattatttcctatggtacaaaaaataccctgctgaaggtcctacattcctgatatctataagttccattaaggataaaaatgaagatggaagattcactgttttcttaaacaaaagtgccaagcacctctctctcgacattgtgccctcccagcctggagactctgcagtgtacttctgtgcagcaagc
56
+ Homosap TRAV29/DV5 TRAV29/DV5*04 IMGT000024 F gaccagcaagttaagcaaaattcaccatccctgagcgtccaggaaggaagaatttctattctgaactgtgactatactaacagcatgtttgattatttcctatggtacaaaaaataccctgctgaaggtcctacattcctgatatctataagttccattaaggataaaaatgaagatggaagattcactgttttcttaaacaaaagtgccaagcacctctctctgcacattgtgccctcccagcctggagactctgcagtgtacttctgtgcagcaagcg
57
+ Homosap TRAV3 TRAV3*01 AE000658 F gctcagtcagtggctcagccggaagatcaggtcaacgttgctgaagggaatcctctgactgtgaaatgcacctattcagtctctggaaacccttatcttttttggtatgttcaataccccaaccgaggcctccagttccttctgaaatacatcacaggggataacctggttaaaggcagctatggctttgaagctgaatttaacaagagccaaacctccttccacctgaagaaaccatctgcccttgtgagcgactccgctttgtacttctgtgctgtgagagaca
58
+ Homosap TRAV30 TRAV30*01 AE000660 F caacaaccagtgcagagtcctcaagccgtgatcctccgagaaggggaagatgctgtcatcaactgcagttcctccaaggctttatattctgtacactggtacaggcagaagcatggtgaagcacccgtcttcctgatgatattactgaagggtggagaacagaagggtcatgaaaaaatatctgcttcatttaatgaaaaaaagcagcaaagctccctgtaccttacggcctcccagctcagttactcaggaacctacttctgcggcacagaga
59
+ Homosap TRAV30 TRAV30*02 X58768 (F) caacaaccagtgcagagtcctcaagccgtgatcctccgagaaggggaagatgctgtcaccaactgcagttcctccaaggctttatattctgtacactggtacaggcagaagcatggtgaagcacccgtcttcctgatgatattactgaagggtggagaacagatgcgtcgtgaaaaaatatctgcttcatttaatgaaaaaaagcagcaaagctccctgtaccttacggcctcccagctcagttactcaggaacctacttctgcggg
60
+ Homosap TRAV30 TRAV30*03 L06883 (F) caacaaccagtgcagagtcctcaagccgtgatcctccgagaaggggaagatgctgtcatcaactgcagttcctccaaggctttatattctgtacactggtacaggcagaagcatggtgaagcacccgtcttcctgatgatattactgaagggtggagaacagaagggtcatgaaaaaatatctgcttcatttaatgaaaaaaagcggcaaagctccctgtaccttacggcctcccagctcagttactcaggaacctacttctgcggc
61
+ Homosap TRAV30 TRAV30*04 U32537 [F] caacaaccagtgcagagtcctcaagccgtgatcctccgagaaggggaagatgctgtcatcaactgcagttcctccaaggctttatattctgtacactggtacaggcagaagcatggtgaagcacccgtcttcctgatgatattactgaagggtggagaacagaagcgtcatgaaaaaatatctgcttcatttaatgaaaaaaagcagcaaagctccctgtaccttacggc
62
+ Homosap TRAV30 TRAV30*05 IMGT000024 F caacaaccagtgcagagtcctcaagccgtgatcctccgagaaggggaagatgctgtcatcaactgcagttcctccaaggctttatattctgtacactggtacaggcagaagcatggtgaagcacccgtcttcctgatgatattactgaagggtggagaacagaagggtcatgacaaaatatctgcttcatttaatgaaaaaaagcagcaaagctccctgtaccttacggcctcccagctcagttactcaggaacctacttctgcggcacagaga
63
+ Homosap TRAV34 TRAV34*01 AE000660 F agccaagaactggagcagagtcctcagtccttgatcgtccaagagggaaagaatctcaccataaactgcacgtcatcaaagacgttatatggcttatactggtataagcaaaagtatggtgaaggtcttatcttcttgatgatgctacagaaaggtggggaagagaaaagtcatgaaaagataactgccaagttggatgagaaaaagcagcaaagttccctgcatatcacagcctcccagcccagccatgcaggcatctacctctgtggagcagaca
64
+ Homosap TRAV35 TRAV35*01 AE000660 F ggtcaacagctgaatcagagtcctcaatctatgtttatccaggaaggagaagatgtctccatgaactgcacttcttcaagcatatttaacacctggctatggtacaagcaggaacctggggaaggtcctgtcctcttgatagccttatataaggctggtgaattgacctcaaatggaagactgactgctcagtttggtataaccagaaaggacagcttcctgaatatctcagcatccatacctagtgatgtaggcatctacttctgtgctgggcag
65
+ Homosap TRAV35 TRAV35*02 X58738 (F) ggtcaacagctgaatcagagtcctcaatctatgtttatccaggaaggagaagatgtctccatgaactgcacttcttcaagcatatttaacacctggctatggtacaagcaggaccctggggaaggtcctgtcctcttgatagccttatataaggctggtgaattgacctcaaatggaagactgactgctcagtttggtataaccagaaaggacagcttcctgaatatctcagcatccatacctagtgatgtaggcatctacttctgtgct
66
+ Homosap TRAV36/DV7 TRAV36/DV7*01 AE000660 F gaagacaaggtggtacaaagccctctatctctggttgtccacgagggagacaccgtaactctcaattgcagttatgaagtgactaactttcgaagcctactatggtacaagcaggaaaagaaagctcccacatttctatttatgctaacttcaagtggaattgaaaagaagtcaggaagactaagtagcatattagataagaaagaactttccagcatcctgaacatcacagccacccagaccggagactcggccatctacctctgtgctgtggagg
67
+ Homosap TRAV36/DV7 TRAV36/DV7*02 X61070 (F) gaagacaaggtggtacaaagccctcaatctctggttgtccacgagggagacactgtaactctcaattgcagttatgaaatgactaactttcgaagcctacaatggtacaagcaggaaaagaaagctcccacatttctatttatgctaacttcaagtggaattgaaaagaagtcaggaagactaagtagcatattagataagaaagaacttttcagcatcctgaacatcacagccacccagaccggagactcggccgtctacctctgtgctgtgg
68
+ Homosap TRAV36/DV7 TRAV36/DV7*03 X58767 (F) gaagacaaggtggtacaaagccctctatctctggttgtccacgagggagacactgtaactcccaattgcagttatgaagtgactaactttcgaagcctactatggtacaagcaggaaaagaaagctcccacatttctatttatgctaacttcaagtggaattgaaaagaagtcaggaagactaagtagcatattagataagaaagaacttttcagcatcctgaacatcacagccacccagaccggagactcggccgtctacctctgtgct
69
+ Homosap TRAV36/DV7 TRAV36/DV7*04 Z46643 (F) gaagacaaggtggtacaaagccctctatctctggttgtccacgagggagacactgtaactctcaattgcagttatgaagtgactaactttcgaagcctactatggtacaagcaggaaaagaaagctcccacatttctatttatgctaacttcaagtggaattgaaaagaagtcaggaagactaagtagcatattagataagaaagaacttttcagcatcctgaacatcacagccacccagaccggagactcggccgtctacctctgtgctg
70
+ Homosap TRAV36/DV7 TRAV36/DV7*05 IMGT000024 F gaagacaaggtggtacaaagccctctatctctggttgtccacgagggagacaccgtaactctcaattgcagttatgaagtgactaactttcgaagcctactatggtacaagcaggaaaagaaagctcccacatttctatttatgctaacttcaagtggaattgaaaagaagtcaggaagactaagtagcatattagataagaaagaacttttcagcatcctgaacatcacagccacccagaccggagactcggccatctacctctgtgctgtggagg
71
+ Homosap TRAV38-1 TRAV38-1*01 AE000661 F gcccagacagtcactcagtctcaaccagagatgtctgtgcaggaggcagagactgtgaccctgagttgcacatatgacaccagtgagaataattattatttgttctggtacaagcagcctcccagcaggcagatgattctcgttattcgccaagaagcttataagcaacagaatgcaacggagaatcgtttctctgtgaacttccagaaagcagccaaatccttcagtctcaagatctcagactcacagctgggggacactgcgatgtatttctgtgctttcatgaagca
72
+ Homosap TRAV38-1 TRAV38-1*02 M64355 (F) gcccagacagtcactcagtctcaaccagagatgtctgtgcaggaggcagagactgtgaccctgagttgcacatatgacaccagtgagaatgattattatttgttctggtacaagcagcctcccagcaggcagatgattctcgttattcgccaagaagcttataagcaacagaatgcaacggagaatcgtttctctgtgaacttccagaaagcagccaaatccttcagtctcaagatctcagactcacagctgggggacactgcgatgtatttctgtgctt
73
+ Homosap TRAV38-1 TRAV38-1*03 M95394 (F) gcccagacagtcactcagtctcaaccagagatgtctgtgcaggaggcagagactgtgaccctgagttgcacatatgacaccagtgagagtaattattatttgttctggtacaaacagcctcccagcaggcagatgattctcgttattcgccaagaagcttataagcaacagaatgcaacggagaatcgtttctctgtgaacttccagaaagcagccaaatccttcagtctcaagatctcagactcacagctgggggacactgcgatgtatttctgtgctttca
74
+ Homosap TRAV38-1 TRAV38-1*04 L06880 (F) gcccagacagtcactcagtcccagccagagatgtctgtgcaggaggcagagactgtgaccctgagttgcacatatgacaccagtgagaataattattatttgttctggtacaagcagcctcccagcaggcagatgattctcgttattcgccaagaagcttataagcaacagaatgcaacggagaatcgtttctctgtgaacttccagaaagcagccaaatccttcagtctcaagatctcagactcacagctgggggacactgcgatgtatttctgtgca
75
+ Homosap TRAV38-2/DV8 TRAV38-2/DV8*01 AE000661 F gctcagacagtcactcagtctcaaccagagatgtctgtgcaggaggcagagaccgtgaccctgagctgcacatatgacaccagtgagagtgattattatttattctggtacaagcagcctcccagcaggcagatgattctcgttattcgccaagaagcttataagcaacagaatgcaacagagaatcgtttctctgtgaacttccagaaagcagccaaatccttcagtctcaagatctcagactcacagctgggggatgccgcgatgtatttctgtgcttataggagcg
76
+ Homosap TRAV39 TRAV39*01 AE000661 F gagctgaaagtggaacaaaaccctctgttcctgagcatgcaggagggaaaaaactataccatctactgcaattattcaaccacttcagacagactgtattggtacaggcaggatcctgggaaaagtctggaatctctgtttgtgttgctatcaaatggagcagtgaagcaggagggacgattaatggcctcacttgataccaaagcccgtctcagcaccctccacatcacagctgccgtgcatgacctctctgccacctacttctgtgccgtggaca
77
+ Homosap TRAV4 TRAV4*01 AE000658 F cttgctaagaccacccagcccatctccatggactcatatgaaggacaagaagtgaacataacctgtagccacaacaacattgctacaaatgattatatcacgtggtaccaacagtttcccagccaaggaccacgatttattattcaaggatacaagacaaaagttacaaacgaagtggcctccctgtttatccctgccgacagaaagtccagcactctgagcctgccccgggtttccctgagcgacactgctgtgtactactgcctcgtgggtgaca
78
+ Homosap TRAV40 TRAV40*01 X73521 F agcaattcagtcaagcagacgggccaaataaccgtctcggagggagcatctgtgactatgaactgcacatacacatccacggggtaccctacccttttctggtatgtggaataccccagcaaacctctgcagcttcttcagagagagacaatggaaaacagcaaaaacttcggaggcggaaatattaaagacaaaaactcccccattgtgaaatattcagtccaggtatcagactcagccgtgtactactgtcttctgggaga
79
+ Homosap TRAV41 TRAV41*01 AE000661 F aaaaatgaagtggagcagagtcctcagaacctgactgcccaggaaggagaatttatcacaatcaactgcagttactcggtaggaataagtgccttacactggctgcaacagcatccaggaggaggcattgtttccttgtttatgctgagctcagggaagaagaagcatggaagattaattgccacaataaacatacaggaaaagcacagctccctgcacatcacagcctcccatcccagagactctgccgtctacatctgtgctgtcaga
80
+ Homosap TRAV5 TRAV5*01 AE000659 F ggagaggatgtggagcagagtcttttcctgagtgtccgagagggagacagctccgttataaactgcacttacacagacagctcctccacctacttatactggtataagcaagaacctggagcaggtctccagttgctgacgtatattttttcaaatatggacatgaaacaagaccaaagactcactgttctattgaataaaaaggataaacatctgtctctgcgcattgcagacacccagactggggactcagctatctacttctgtgcagagagta
81
+ Homosap TRAV6 TRAV6*01 AE000659 F agccaaaagatagaacagaattccgaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattccccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatatcacagcctcccagcctgcagactcagctacctacctctgtgctctagaca
82
+ Homosap TRAV6 TRAV6*02 X58747 (F) agccaaaagatagaacagaattccgaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatatcacagcctcccagcctgcagactcagctacctacctctgtgct
83
+ Homosap TRAV6 TRAV6*03 Z49060 [F] gaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctacttatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatatcacagcctcccagcctgcagactcagctacctacctctgtgct
84
+ Homosap TRAV6 TRAV6*04 Y10409 [F] gaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatgtcacagcctcccagcctgcagactcagctacctacctctgtgct
85
+ Homosap TRAV6 TRAV6*05 Y10410 [F] gaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacgaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatatcacagcctcccagcctgcagactcagctacctacctctgtgct
86
+ Homosap TRAV6 TRAV6*06 U32542 [F] agccaaaagatagaacagaattccgaggccctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaccaga
87
+ Homosap TRAV6 TRAV6*07 IMGT000024 F agccaaaagatagaacagaattccgaggctctgaacattcaggagggtaaaacggccaccctgacctgcaactatacaaactattctccagcatacttacagtggtaccgacaagatccaggaagaggccctgttttcttgctactcatacgtgaaaatgagaaagaaaaaaggaaagaaagactgaaggtcacctttgataccacccttaaacagagtttgtttcatatcacagcctcccagcctgcagactcagctacctacctctgtgctctagaca
88
+ Homosap TRAV7 TRAV7*01 AE000659 F gaaaaccaggtggagcacagccctcattttctgggaccccagcagggagacgttgcctccatgagctgcacgtactctgtcagtcgttttaacaatttgcagtggtacaggcaaaatacagggatgggtcccaaacacctattatccatgtattcagctggatatgagaagcagaaaggaagactaaatgctacattactgaagaatggaagcagcttgtacattacagccgtgcagcctgaagattcagccacctatttctgtgctgtagatg
89
+ Homosap TRAV8-1 TRAV8-1*01 AE000659 F gcccagtctgtgagccagcataaccaccacgtaattctctctgaagcagcctcactggagttgggatgcaactattcctatggtggaactgttaatctcttctggtatgtccagtaccctggtcaacaccttcagcttctcctcaagtacttttcaggggatccactggttaaaggcatcaagggctttgaggctgaatttataaagagtaaattctcctttaatctgaggaaaccctctgtgcagtggagtgacacagctgagtacttctgtgccgtgaatgc
90
+ Homosap TRAV8-1 TRAV8-1*02 U32520 [F] gcccagtctgtgagccagcataaccaccacgtaattctctctgaagcagcctcactggagttgggatgcaactattcctatggtggaactgttaatctcttctggtatgtccagtaccctggtcaacaccttcagcttctcctcaagtacttttcaggggatccactggttaaaggcatcaagggcgttgaggctgaatttataaagagtaaattctcctttaatctgaggaaaccctctgtgcagtgga
91
+ Homosap TRAV8-2 TRAV8-2*01 AE000659 F gcccagtcggtgacccagcttgacagccacgtctctgtctctgaaggaaccccggtgctgctgaggtgcaactactcatcttcttattcaccatctctcttctggtatgtgcaacaccccaacaaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgttgtgagtga
92
+ Homosap TRAV8-2 TRAV8-2*02 M17650 (F) gcccagtcggtgacccagcttagcagccacgtctctgtctctgaaggaaccccggtgctgctgaggtgcaactactcatcttcttattcaccatctctcttctggtatgtgcaacaccccaacaaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgttgtga
93
+ Homosap TRAV8-2 TRAV8-2*03 IMGT000024 F gcccagtcggtgacccagcttgacagccacgtctctgtctctgaaggaaccccggtgctgctgaggtgcaactactcatcttcttattcaccgtctctcttctggtatgtgcaacaccccaacaaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgttgtgagtga
94
+ Homosap TRAV8-3 TRAV8-3*01 AE000659 F gcccagtcagtgacccagcctgacatccacatcactgtctctgaaggagcctcactggagttgagatgtaactattcctatggggcaacaccttatctcttctggtatgtccagtcccccggccaaggcctccagctgctcctgaagtacttttcaggagacactctggttcaaggcattaaaggctttgaggctgaatttaagaggagtcaatcttccttcaatctgaggaaaccctctgtgcattggagtgatgctgctgagtacttctgtgctgtgggtgc
95
+ Homosap TRAV8-3 TRAV8-3*02 M35617 (F) gcccagtcagtgacccagcctgacatccacatcactgtctctgaaggagcctcactggagttgagatgtaactattcctatggggcaacaccttatctcttctggtatgtccagtcccccggccaaggcctccagctgctcctgaagtacttttcaggagacactctggttcaaggcattaaaggctttgaggctgaatttaagaggagtcaatcttccttcaacctgaggaaaccctctgtgcattggagtgatgctgctgagtacttctgtgctgtggtt
96
+ Homosap TRAV8-3 TRAV8-3*03 L06885 (F) gcccagtcagtgacccagcctgacatccacatcactgtctctgaaggagcctcactggagttgagatgtaactattcctatggggcaacaccttatctcttctggtatgtccagtcccccggccaaggcctccagctgctcctgaagtacttttcaggagacactctggttcaaggtattaaaggctttgaggctgaatttaagaggagtcaatcttccttcaatctgaggaaaccctctgtgcattggagtgatgcgtctgagtacttctgtgct
97
+ Homosap TRAV8-4 TRAV8-4*01 AE000659 F gcccagtcggtgacccagcttggcagccacgtctctgtctctgaaggagccctggttctgctgaggtgcaactactcatcgtctgttccaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgctgtgagtga
98
+ Homosap TRAV8-4 TRAV8-4*02 M12423 (F) gcccagtcggtgacccagcttggcagccacgtctctgtctctgaaggagccctggttctgctgaggtgcaactactcatcgtctgttccaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacaaaaccctcagcccatatgagcgacgcggctgagtacttctgtgctgtgagtga
99
+ Homosap TRAV8-4 TRAV8-4*03 D13077 (F) gcccagtcggtgacccagcttggcagccacgtctctgtctctgagggagccctggttctgctgaggtgcaactactcatcgtctgttccaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacaacaggggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgct
100
+ Homosap TRAV8-4 TRAV8-4*04 M12959 (F) gcccagtcggtgacccagcttggcagccacgtctctgtctctgaacgagccctggttctgctgaggtgcaactactcatcgtctgttccaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgctgtgagtga
101
+ Homosap TRAV8-4 TRAV8-4*05 X63455 (F) gcccagtcggtgacccagcttggcagccacgtctctgtctctgaaggagccctggttctgctgaggtgcaactactcatcgtctgttccaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggaatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaaccctcagcccatatgagcgacgcggctgagtacttctgtgctgtgagtga
102
+ Homosap TRAV8-4 TRAV8-4*06 K02777 (F) ggtgcaactcactactgctgtccaccaatactcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacatcagcggccaccctggttaaaggcatcaacggttttgaggctgaatttaagaagagtgaaacctccttccacctgacgaaacccgcagcccatatgagcgacgcggctgagtacttctgtgctgtgagtga
103
+ Homosap TRAV8-4 TRAV8-4*07 M17665 (F) gttgaaccatatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtacacaacaggggccaccctggttaaaggcatcaacggttttgaggctgaatttaaaaagagtgaaacctccttccacctgacgaaaccctcagcccatatgaccgacccggctgagtacttctgtgctgtgag
104
+ Homosap TRAV8-6 TRAV8-6*01 X02850 F gcccagtctgtgacccagcttgacagccaagtccctgtctttgaagaagcccctgtggagctgaggtgcaactactcatcgtctgtttcagtgtatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtatttatcaggatccaccctggttgaaagcatcaacggttttgaggctgaatttaacaagagtcaaacttccttccacttgaggaaaccctcagtccatataagcgacacggctgagtacttctgtgctgtgagtga
105
+ Homosap TRAV8-6 TRAV8-6*02 AE000659 F gcccagtctgtgacccagcttgacagccaagtccctgtctttgaagaagcccctgtggagctgaggtgcaactactcatcgtctgtttcagtgtatctcttctggtatgtgcaataccccaaccaaggactccagcttctcctgaagtatttatcaggatccaccctggttaaaggcatcaacggttttgaggctgaatttaacaagagtcaaacttccttccacttgaggaaaccctcagtccatataagcgacacggctgagtacttctgtgctgtgagtga
106
+ Homosap TRAV9-1 TRAV9-1*01 AE000659 F ggagattcagtggtccagacagaaggccaagtgctcccctctgaaggggattccctgattgtgaactgctcctatgaaaccacacagtacccttcccttttttggtatgtccaatatcctggagaaggtccacagctccacctgaaagccatgaaggccaatgacaagggaaggaacaaaggttttgaagccatgtaccgtaaagaaaccacttctttccacttggagaaagactcagttcaagagtcagactccgctgtgtacttctgtgctctgagtga
107
+ Homosap TRAV9-2 TRAV9-2*01 AE000659 F ggaaattcagtgacccagatggaagggccagtgactctctcagaagaggccttcctgactataaactgcacgtacacagccacaggatacccttcccttttctggtatgtccaatatcctggagaaggtctacagctcctcctgaaagccacgaaggctgatgacaagggaagcaacaaaggttttgaagccacataccgtaaagaaaccacttctttccacttggagaaaggctcagttcaagtgtcagactcagcggtgtacttctgtgctctgagtga
108
+ Homosap TRAV9-2 TRAV9-2*02 IMGT000024 F ggagattcagtgacccagatggaagggccagtgactctctcagaagaggccttcctgactataaactgcacgtacacagccacaggatacccttcccttttctggtatgtccaatatcctggagaaggtctacagctcctcctgaaagccacgaaggctgatgacaagggaagcaacaaaggttttgaagccacataccgtaaagaaaccacttctttccacttggagaaaggctcagttcaagtgtcagactcagcggtgtacttctgtgctctgagtga
109
+ Homosap TRAV9-2 TRAV9-2*03 L06881 (F) ggagattcagtgacccagatggaagggccagtgactctctcagaagaggccttcctgactataaactgcacgtacacagccacaggatacccttcccttttctggtatgtccaatatcctggagaaggtctacagctcctcctgaaagccacgaaggctgatgacaagggaagcaacaaaggttttgaagccacataccgtaaggaaaccacttctttccacttggagaaaggctcagttcaagtgtcagactcagcggtgtacttctgtgct
110
+ Homosap TRAV9-2 TRAV9-2*04 L06882 (F) ggaaattcagtgacccagatggaagggccagtgactctctcagaagaggccttcctgactataaactgcacgtacacagccacaggatacccttcccttttctggtatgtccaatatcctggagaaggtctacagctcctcctgaaagccacgaaggctgatgacaagggaagcaacaaaggttttgaagccacataccgtaaggaaaccacttctttccacttggagaaaggctcagttcaagtgtcagactcagcggtgtacttctgtgct
src/library/trbjs_aa.tsv ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Species Gene Allele AccNum Functionality aa_seq
2
+ Homosap TRBJ1-1 TRBJ1-1*01 K02545 F NTEAFFGQGTRLTVV
3
+ Homosap TRBJ1-2 TRBJ1-2*01 K02545 F NYGYTFGSGTRLTVV
4
+ Homosap TRBJ1-3 TRBJ1-3*01 M14158 F SGNTIYFGEGSWLTVV
5
+ Homosap TRBJ1-4 TRBJ1-4*01 M14158 F TNEKLFFGSGTQLSVL
6
+ Homosap TRBJ1-5 TRBJ1-5*01 M14158 F SNQPQHFGDGTRLSIL
7
+ Homosap TRBJ1-6 TRBJ1-6*01 M14158 F SYNSPLHFGNGTRLTVT
8
+ Homosap TRBJ1-6 TRBJ1-6*02 L36092 F SYNSPLHFGNGTRLTVT
9
+ Homosap TRBJ2-1 TRBJ2-1*01 X02987 F SYNEQFFGPGTRLTVL
10
+ Homosap TRBJ2-2 TRBJ2-2*01 X02987 F NTGELFFGEGSRLTVL
11
+ Homosap TRBJ2-3 TRBJ2-3*01 X02987 F STDTQYFGPGTRLTVL
12
+ Homosap TRBJ2-4 TRBJ2-4*01 X02987 F AKNIQYFGAGTRLSVL
13
+ Homosap TRBJ2-5 TRBJ2-5*01 X02987 F QETQYFGPGTRLLVL
14
+ Homosap TRBJ2-6 TRBJ2-6*01 X02987 F SGANVLTFGAGSRLTVL
15
+ Homosap TRBJ2-7 TRBJ2-7*01 M14159 F SYEQYFGPGTRLTVT
src/library/trbjs_nt.tsv ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Species Gene Allele AccNum Functionality nt_seq
2
+ Homosap TRBJ1-1 TRBJ1-1*01 K02545 F tgaacactgaagctttctttggacaaggcaccagactcacagttgtag
3
+ Homosap TRBJ1-2 TRBJ1-2*01 K02545 F ctaactatggctacaccttcggttcggggaccaggttaaccgttgtag
4
+ Homosap TRBJ1-3 TRBJ1-3*01 M14158 F ctctggaaacaccatatattttggagagggaagttggctcactgttgtag
5
+ Homosap TRBJ1-4 TRBJ1-4*01 M14158 F caactaatgaaaaactgttttttggcagtggaacccagctctctgtcttgg
6
+ Homosap TRBJ1-5 TRBJ1-5*01 M14158 F tagcaatcagccccagcattttggtgatgggactcgactctccatcctag
7
+ Homosap TRBJ1-6 TRBJ1-6*01 M14158 F ctcctataattcacccctccactttgggaatgggaccaggctcactgtgacag
8
+ Homosap TRBJ1-6 TRBJ1-6*02 L36092 F ctcctataattcacccctccactttgggaacgggaccaggctcactgtgacag
9
+ Homosap TRBJ2-1 TRBJ2-1*01 X02987 F ctcctacaatgagcagttcttcgggccagggacacggctcaccgtgctag
10
+ Homosap TRBJ2-2 TRBJ2-2*01 X02987 F cgaacaccggggagctgttttttggagaaggctctaggctgaccgtactgg
11
+ Homosap TRBJ2-3 TRBJ2-3*01 X02987 F agcacagatacgcagtattttggcccaggcacccggctgacagtgctcg
12
+ Homosap TRBJ2-4 TRBJ2-4*01 X02987 F agccaaaaacattcagtacttcggcgccgggacccggctctcagtgctgg
13
+ Homosap TRBJ2-5 TRBJ2-5*01 X02987 F accaagagacccagtacttcgggccaggcacgcggctcctggtgctcg
14
+ Homosap TRBJ2-6 TRBJ2-6*01 X02987 F ctctggggccaacgtcctgactttcggggccggcagcaggctgaccgtgctgg
15
+ Homosap TRBJ2-7 TRBJ2-7*01 M14159 F ctcctacgagcagtacttcgggccgggcaccaggctcacggtcacag
src/library/trbvs_aa.tsv ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Species Gene Allele AccNum Functionality aa_seq
2
+ Homosap TRBV10-1 TRBV10-1*01 L36092 F DAEITQSPRHKITETGRQVTLACHQTWNHNNMFWYRQDLGHGLRLIHYSYGVQDTNKGEVSDGYSVSRSNTEDLPLTLESAASSQTSVYFCASSE
3
+ Homosap TRBV10-1 TRBV10-1*02 AF009660 F DAEITQSPRHKITETGRQVTLACHQTWNHNNMFWYRQDLGHGLRLIHYSYGVHDTNKGEVSDGYSVSRSNTEDLPLTLESAASSQTSVYFCASSE
4
+ Homosap TRBV10-2 TRBV10-2*01 L36092 F DAGITQSPRYKITETGRQVTLMCHQTWSHSYMFWYRQDLGHGLRLIYYSAAADITDKGEVPDGYVVSRSKTENFPLTLESATRSQTSVYFCASSE
5
+ Homosap TRBV10-2 TRBV10-2*02 IMGT000021 F DAGITQSPRYKITETGRQVTLMCHQTWSHSYMFWYRQDLGHGLRLIYYSAAADITDKGEVPDGYVVSRSKTENFPLTLESATRSQTSVYFCASSE
6
+ Homosap TRBV10-3 TRBV10-3*01 U03115 F DAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGHGLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFCAISE
7
+ Homosap TRBV10-3 TRBV10-3*02 U17047 F DAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGHGLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFCAISE
8
+ Homosap TRBV10-3 TRBV10-3*03 L33101 [F] DAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGHGLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFC
9
+ Homosap TRBV10-3 TRBV10-3*04 L33102 [F] DAGITQSPRHKVTETGTPVTLRCHQTENHRYMYWYRQDPGHGLRLIHYSYGVKDTDKGEVSDGYSVSRSKTEDFLLTLESATSSQTSVYFC
10
+ Homosap TRBV11-1 TRBV11-1*01 M33233 F EAEVAQSPRYKITEKSQAVAFWCDPISGHATLYWYRQILGQGPELLVQFQDESVVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAMYLCASSL
11
+ Homosap TRBV11-2 TRBV11-2*01 L36092 F EAGVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQFQNNGVVDDSQLPKDRFSAERLKGVDSTLKIQPAKLEDSAVYLCASSL
12
+ Homosap TRBV11-2 TRBV11-2*02 M33235 [F] EAGVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQFQNNGVVDDSQLPKDRFSAERLKGVDSTLKIQPAKLENSAVYLCASS
13
+ Homosap TRBV11-2 TRBV11-2*03 IMGT000021 F EAGVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQFQNNGVVDDSQLPKDRFSAERLKGVDSTLKIQPAKLEDSAVYLCASSL
14
+ Homosap TRBV11-3 TRBV11-3*01 U03115 F EAGVVQSPRYKIIEKKQPVAFWCNPISGHNTLYWYLQNLGQGPELLIRYENEEAVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAVYLCASSL
15
+ Homosap TRBV11-3 TRBV11-3*02 X58797 (F) EAGVVQSPRYKIIEKKQPVAFWCNPISGHNTLYWYRQNLGQGPELLIRYENEEAVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAVYLCASS
16
+ Homosap TRBV11-3 TRBV11-3*04 AB305924 (F) EAGVVQSPRYKIIEKKQPVAFWCNPISGHNTLYWYRQNLGQGPELLIRYENEEAVDDSQLPKDRFSAERLKGVDSTLKIQPAELGDSAVYLCASSL
17
+ Homosap TRBV12-3 TRBV12-3*01 X07192 F DAGVIQSPRHEVTEMGQEVTLRCKPISGHNSLFWYRQTMMRGLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL
18
+ Homosap TRBV12-4 TRBV12-4*01 K02546 F DAGVIQSPRHEVTEMGQEVTLRCKPISGHDYLFWYRQTMMRGLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLKIQPSEPRDSAVYFCASSL
19
+ Homosap TRBV12-4 TRBV12-4*02 M14264 (F) DAGVIQSPRHEVTEMGQEVTLRCKPISGHDYLFWYRQTMMRGLELLIYFNNNVPIDDSGMPEDRFSAKMPNASFSTLRIQPSEPRDSAVYFCASSL
20
+ Homosap TRBV12-5 TRBV12-5*01 X07223 F DARVTQTPRHKVTEMGQEVTMRCQPILGHNTVFWYRQTMMQGLELLAYFRNRAPLDDSGMPKDRFSAEMPDATLATLKIQPSEPRDSAVYFCASGL
21
+ Homosap TRBV13 TRBV13*01 U03115 F AAGVIQSPRHLIKEKRETATLKCYPIPRHDTVYWYQQGPGQDPQFLISFYEKMQSDKGSIPDRFSAQQFSDYHSELNMSSLELGDSALYFCASSL
22
+ Homosap TRBV13 TRBV13*02 M62378 (F) AAGVIQSPRHLIREKRETATLKCYPIPRHDTVYWYQQGPGQDPQFFISFYEKMQSDKGSIPDRFSAQQFSDYHSELNMSSLELGDSALYFCASS
23
+ Homosap TRBV14 TRBV14*01 X06154 F EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASSQ
24
+ Homosap TRBV14 TRBV14*02 X57722 (F) EAGVTQFPSHSVIEKGQTVTLRCDPISGHDNLYWYRRVMGKEIKFLLHFVKESKQDESGMPNNRFLAERTGGTYSTLKVQPAELEDSGVYFCASS
25
+ Homosap TRBV15 TRBV15*01 U03115 F DAMVIQNPRYQVTQFGKPVTLSCSQTLNHNVMYWYQQKSSQAPKLLFHYYDKDFNNEADTPDNFQSRRPNTSFCFLDIRSPGLGDTAMYLCATSR
26
+ Homosap TRBV15 TRBV15*02 IMGT000021 F DAMVIQNPRYQVTQFGKPVTLSCSQTLNHNVMYWYQQKSSQAPKLLFHYYDKDFNNEADTPDNFQSRRPNTSFCFLDIRSPGLGDAAMYLCATSR
27
+ Homosap TRBV15 TRBV15*03 M62376 (F) DAMVIQNPRYRVTQFGKPVTLSCSQTLNHNVMYWYQQKSSQAPKLLFHYYNKDFNNEADTPDNFQSRRPNTSFCFLDIRSPGLGDAAMYQCATS
28
+ Homosap TRBV16 TRBV16*01 L26231 F GEEVAQTPKHLVRGEGQKAKLYCAPIKGHSYVFWYQQVLKNEFKFLISFQNENVFDETGMPKERFSAKCLPNSPCSLEIQATKLEDSAVYFCASSQ
29
+ Homosap TRBV16 TRBV16*03 L26054 (F) GEEVAQTPKHLVRGEGQKAKLYCAPIKGHSYVFWYQQVLKNEFKFLVSFQNENVFDETGMPKERFSAKCLPNSPCSLEIQATKLEDSAVYFCASS
30
+ Homosap TRBV18 TRBV18*01 L36092 F NAGVMQNPRHLVRRRGQEARLRCSPMKGHSHVYWYRQLPEEGLKFMVYLQKENIIDESGMPKERFSAEFPKEGPSILRIQQVVRGDSAAYFCASSP
31
+ Homosap TRBV19 TRBV19*01 L36092 F DGGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQGLRLIYYSQIVNDFQKGDIAEGYSVSREKKESFPLTVTSAQKNPTAFYLCASSI
32
+ Homosap TRBV19 TRBV19*02 U48259 F DGGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQVPGQGLRLIYYSHIVNDFQKGDIAEGYSVSREKKESFPLTVTSAQKNPTAFYLCASSI
33
+ Homosap TRBV19 TRBV19*03 M97725 (F) DGGITQSPKYLFRKEGQNVTLSCEQNLNHDAMYWYRQDPGQGLRLIYYSHIVNDFQKGDIAEGYSVSREKKESFPLTVTSAQKNPTAFYLCAS
34
+ Homosap TRBV2 TRBV2*01 L36092 F EPEVTQTPSHQVTQMGQEVILRCVPISNHLYFYWYRQILGQKVEFLVSFYNNEISEKSEIFDDQFSVERPDGSNFTLKIRSTKLEDSAMYFCASSE
35
+ Homosap TRBV2 TRBV2*02 M62379 (F) EPEVTQTPSHQVTQMGQEVILHCVPISNHLYFYWYRQILGQKVEFLVSFYNNEISEKSEIFDDQFSVERPDGSNFTLKIRSTKLEDSAMYFCASS
36
+ Homosap TRBV2 TRBV2*03 M64351 (F) EPEVTQTPSHQVTQMGQEVILRCVPISNHLYFYWYRQILGQKVEFLVSFYNNEISEKSEIFDDQFSVERPDGSNFTLKIRSTKLEDSAMYFCASSE
37
+ Homosap TRBV20-1 TRBV20-1*01 M11955 F GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR
38
+ Homosap TRBV20-1 TRBV20-1*02 X72719 F GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSA
39
+ Homosap TRBV20-1 TRBV20-1*03 M11954 (F) GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGCKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSA
40
+ Homosap TRBV20-1 TRBV20-1*04 M14263 (F) GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAS
41
+ Homosap TRBV20-1 TRBV20-1*05 X57604 (F) GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR
42
+ Homosap TRBV20-1 TRBV20-1*06 D13088 (F) GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSA
43
+ Homosap TRBV20-1 TRBV20-1*07 X74852 (F) GAVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKKSLMQIATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSAR
44
+ Homosap TRBV20/OR9-2 TRBV20/OR9-2*01 L05149 (F) SAVVSQHPSRVICKSGTSVNIECRSLDFQATTMFWYRQLRKQSLMLMATSNEGSEVTYEQGVKKDKFPINHPNLTFSALTVTSAHPEDSSFYICSAR
45
+ Homosap TRBV20/OR9-2 TRBV20/OR9-2*03 L05149 (F) SAVVSQHPSRVICKSGTSVNIECRSLDFQATTMFWYRQLRKQSLMLMAASNEGSEVTYEQGVKKDKFPINHPNLTFSALTVTSAHPEDSSFYICSAR
46
+ Homosap TRBV21-1 TRBV21-1*01 AF029308 (F) DTKVTQRPRFLVKANEQKAKMDCVPIKRHSYVYWYHKTLEEELKFFIYFQNEEIIQKAEIINERFSAQCPQNSPCTLEIQSTESGDTARYFCANSK
47
+ Homosap TRBV23-1 TRBV23-1*01 L36092 (F) HAKVTQTPGHLVKGKGQKTKMDCTPEKGHTFVYWYQQNQNKEFMLLISFQNEQVLQETEMHKKRFSSQCPKNAPCSLAILSSEPGDTALYLCASSQ
48
+ Homosap TRBV24-1 TRBV24-1*01 M11951 F DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLRLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDL
49
+ Homosap TRBV24-1 TRBV24-1*02 IMGT000021 F DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLQLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDL
50
+ Homosap TRBV24/OR9-2 TRBV24/OR9-2*01 L05153 F DADVTQTPRNRITKTGKRIMLECSQTKGHDRMYWYRQDPGLGLQLIYYSFDVKDINKGEISDGYSVSRQAQAKFSLSLESAIPNQTALYFCATSDL
51
+ Homosap TRBV25-1 TRBV25-1*01 L36092 F EADIYQTPRYLVIGTGKKITLECSQTMGHDKMYWYQQDPGMELHLIHYSYGVNSTEKGDLSSESTVSRIRTEHFPLTLESARPSHTSQYLCASSE
52
+ Homosap TRBV27 TRBV27*01 L36092 F EAQVTQNPRYLITVTGKKLTVTCSQNMNHEYMSWYRQDPGLGLRQIYYSMNVEVTDKGDVPEGYKVSRKEKRNFPLILESPSPNQTSLYFCASSL
53
+ Homosap TRBV28 TRBV28*01 U08314 F DVKVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGLGLRLIYFSYDVKMKEKGDIPEGYSVSREKKERFSLILESASTNQTSMYLCASSL
54
+ Homosap TRBV29-1 TRBV29-1*01 L36092 F SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVE
55
+ Homosap TRBV29-1 TRBV29-1*02 M13847 (F) SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSSLTVSNMSPEDSSIYLCSVE
56
+ Homosap TRBV3-1 TRBV3-1*01 U07977 F DTAVSQTPKYLVTQMGNDKSIKCEQNLGHDTMYWYKQDSKKFLKIMFSYNNKELIINETVPNRFSPKSPDKAHLNLHINSLELGDSAVYFCASSQ
57
+ Homosap TRBV3-1 TRBV3-1*02 L06889 (F) DTAVSQTPKYLVTQMGNDKSIKCEQNLGHDTMYWYKQDSKKFLKIMFSYNNKEIIINETVPNRFSPKSPDKAKLNLHINSLELGDSAVYFCAS
58
+ Homosap TRBV30 TRBV30*01 L36092 F SQTIHQWPATLVQPVGSPLSLECTVEGTSNPNLYWYRQAAGRGLQLLFYSVGIGQISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYLCAWS
59
+ Homosap TRBV30 TRBV30*02 Z13967 F SQTIHQWPATLVQPVGSPLSLECTVEGTSNPNLYWYRQAAGRGLQLLFYSVGIGQISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYLCAWS
60
+ Homosap TRBV30 TRBV30*05 L06893 (F) SQTIHQWPATLVQPVGSPLSLECTVEGTSNPNLYWYRQAAGRGLQLLFYSVGIGQISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYLCAWG
61
+ Homosap TRBV4-1 TRBV4-1*01 U07977 F DTEVTQTPKHLVMGMTNKKSLKCEQHMGHRAMYWYKQKAKKPPELMFVYSYEKLSINESVPSRFSPECPNSSLLNLHLHALQPEDSALYLCASSQ
62
+ Homosap TRBV4-2 TRBV4-2*01 U07975 F ETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKKPLELMFVYNFKEQTENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASSQ
63
+ Homosap TRBV4-2 TRBV4-2*02 X58811 (F) ETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKKPLELMFVYNFKEQTENNSVPSRFSPECPNSSHLCLHLHTLQPEDSALYLCAST
64
+ Homosap TRBV4-3 TRBV4-3*01 U07978 F ETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKKPLELMFVYSLEERVENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASSQ
65
+ Homosap TRBV4-3 TRBV4-3*02 X58812 (F) ETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKKPLELMFVYSLEERVENNSVPSRFSPECPNSSHLSLHLHTLQPEDSALYLCASS
66
+ Homosap TRBV4-3 TRBV4-3*03 L06888 (F) ETGVTQTPRHLVMGMTNKKSLKCEQHLGHNAMYWYKQSAKKPLELMFVYSLEERVENNSVPSRFSPECPNSSHLFLHLHTLQPEDSALYLCASS
67
+ Homosap TRBV5-1 TRBV5-1*01 L36092 F KAGVTQTPRYLIKTRGQQVTLSCSPISGHRSVSWYQQTPGQGLQFLFEYFSETQRNKGNFPGRFSGRQFSNSRSEMNVSTLELGDSALYLCASSL
68
+ Homosap TRBV5-1 TRBV5-1*02 M14271 (F) RAGVTQTPRHLIKTRGQQVTLGCSPISGHRSVSWYQQTLGQGLQFLFEYFSETQRNKGNFLGRFSGRQFSNSRSEMNVSTLELGDSALYLCAS
69
+ Homosap TRBV5-3 TRBV5-3*01 L36092 F EAGVTQSPTHLIKTRGQQVTLRCSPISGHSSVSWYQQAPGQGPQFIFEYANELRRSEGNFPNRFSGRQFHDCCSEMNVSALELGDSALYLCARSL
70
+ Homosap TRBV5-4 TRBV5-4*01 L36092 F ETGVTQSPTHLIKTRGQQVTLRCSSQSGHNTVSWYQQALGQGPQFIFQYYREEENGRGNFPPRFSGLQFPNYSSELNVNALELDDSALYLCASSL
71
+ Homosap TRBV5-4 TRBV5-4*02 X57615 (F) ETGVTQSPTHLIKTRGQQVTLRCSSQSGHNTVSWYQQALGQGPQFIFQYYREEENGRGNFPPRFSGLQFPNYNSELNVNALELDDSALYLCASS
72
+ Homosap TRBV5-5 TRBV5-5*01 L36092 F DAGVTQSPTHLIKTRGQQVTLRCSPISGHKSVSWYQQVLGQGPQFIFQYYEKEERGRGNFPDRFSARQFPNYSSELNVNALLLGDSALYLCASSL
73
+ Homosap TRBV5-5 TRBV5-5*02 X57611 (F) DAGVTQSPTHLIKTRGQHVTLRCSPISGHKSVSWYQQVLGQGPQFIFQYYEKEERGRGNFPDRFSARQFPNYSSELNVNALLLGDSALYLCASS
74
+ Homosap TRBV5-5 TRBV5-5*03 X58801 (F) DAGVTQSPTHLIKTRGQQVTLRCSPISEHKSVSWYQQVLGQGPQFIFQYYEKEERGRGNFPDRFSARQFPNYSSELNVNALLLGDSALYLCASS
75
+ Homosap TRBV5-6 TRBV5-6*01 L36092 F DAGVTQSPTHLIKTRGQQVTLRCSPKSGHDTVSWYQQALGQGPQFIFQYYEEEERQRGNFPDRFSGHQFPNYSSELNVNALLLGDSALYLCASSL
76
+ Homosap TRBV5-7 TRBV5-7*01 L36092 F DAGVTQSPTHLIKTRGQHVTLRCSPISGHTSVSSYQQALGQGPQFIFQYYEKEERGRGNFPDQFSGHQFPNYSSELNVNALLLGDSALYLCASSL
77
+ Homosap TRBV5-8 TRBV5-8*01 L36092 F EAGVTQSPTHLIKTRGQQATLRCSPISGHTSVYWYQQALGLGLQFLLWYDEGEERNRGNFPPRFSGRQFPNYSSELNVNALELEDSALYLCASSL
78
+ Homosap TRBV6-1 TRBV6-1*01 X61446 F NAGVTQTPKFQVLKTGQSMTLQCAQDMNHNSMYWYRQDPGMGLRLIYYSASEGTTDKGEVPNGYNVSRLNKREFSLRLESAAPSQTSVYFCASSE
79
+ Homosap TRBV6-2 TRBV6-2*01 X61445 F NAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGMGLRLIHYSVGEGTTAKGEVPDGYNVSRLKKQNFLLGLESAAPSQTSVYFCASSY
80
+ Homosap TRBV6-3 TRBV6-3*01 U07978 F NAGVTQTPKFRVLKTGQSMTLLCAQDMNHEYMYWYRQDPGMGLRLIHYSVGEGTTAKGEVPDGYNVSRLKKQNFLLGLESAAPSQTSVYFCASSY
81
+ Homosap TRBV6-4 TRBV6-4*01 X61653 F IAGITQAPTSQILAAGRRMTLRCTQDMRHNAMYWYRQDLGLGLRLIHYSNTAGTTGKGEVPDGYSVSRANTDDFPLTLASAVPSQTSVYFCASSD
82
+ Homosap TRBV6-4 TRBV6-4*02 AF009660 F TAGITQAPTSQILAAGRSMTLRCTQDMRHNAMYWYRQDLGLGLRLIHYSNTAGTTGKGEVPDGYSVSRANTDDFPLTLASAVPSQTSVYFCASSD
83
+ Homosap TRBV6-5 TRBV6-5*01 L36092 F NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLIHYSVGAGITDQGEVPNGYNVSRSTTEDFPLRLLSAAPSQTSVYFCASSY
84
+ Homosap TRBV6-6 TRBV6-6*01 L36092 F NAGVTQTPKFRILKIGQSMTLQCTQDMNHNYMYWYRQDPGMGLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAPSQTSVYFCASSY
85
+ Homosap TRBV6-6 TRBV6-6*02 AF009662 F NAGVTQTPKFRILKIGQSMTLQCAQDMNHNYMYWYRQDPGMGLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAPSQTSVYFCASSY
86
+ Homosap TRBV6-6 TRBV6-6*03 X58815 (F) NAGVTQTPKFRILKIGQSMTLQCAQDMNHNYMYWYRQDPGMGLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAPSQTSVYFCASS
87
+ Homosap TRBV6-6 TRBV6-6*04 X74848 (F) NAGVTQTPKFRILKIGQSMTLQCTQDMNHEYMYWYRQDPGMGLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAPSQTSVYFCASSR
88
+ Homosap TRBV6-6 TRBV6-6*05 L06892 (F) NAGVTQTPKFRILKIGQSMTLQCAQDMNHNYMYWYRQDPGMGLKLIYYSVGAGITDKGEVPNGYNVSRSTTEDFPLRLELAAASQTSVYFCASS
89
+ Homosap TRBV6-7 TRBV6-7*01 L36092 F NAGVTQTPKFHVLKTGQSMTLLCAQDMNHEYMYRYRQDPGKGLRLIYYSVAAALTDKGEVPNGYNVSRSNTEDFPLKLESAAPSQTSVYFCASSY
90
+ Homosap TRBV6-8 TRBV6-8*01 L36092 F NAGVTQTPKFHILKTGQSMTLQCAQDMNHGYMSWYRQDPGMGLRLIYYSAAAGTTDKEVPNGYNVSRLNTEDFPLRLVSAAPSQTSVYLCASSY
91
+ Homosap TRBV6-9 TRBV6-9*01 X61447 F NAGVTQTPKFHILKTGQSMTLQCAQDMNHGYLSWYRQDPGMGLRRIHYSVAAGITDKGEVPDGYNVSRSNTEDFPLRLESAAPSQTSVYFCASSY
92
+ Homosap TRBV7-1 TRBV7-1*01 X61444 F GAGVSQSLRHKVAKKGKDVALRYDPISGHNALYWYRQSLGQGLEFPIYFQGKDAADKSGLPRDRFSAQRSEGSISTLKFQRTQQGDLAVYLCASSS
93
+ Homosap TRBV7-2 TRBV7-2*01 X61442 F GAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQSLGQGLEFLIYFQGNSAPDKSGLPSDRFSAERTGGSVSTLTIQRTQQEDSAVYLCASSL
94
+ Homosap TRBV7-2 TRBV7-2*02 L36190 F GAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQRLGQGLEFLIYFQGNSAPDKSGLPSDRFSAERTGESVSTLTIQRTQQEDSAVYLCASSL
95
+ Homosap TRBV7-2 TRBV7-2*03 U07975 F GAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQRLGQGLEFLIYFQGNSAPDKSGLPSDRFSAERTGESVSTLTIQRTQQEDSAVYLCTSSL
96
+ Homosap TRBV7-2 TRBV7-2*04 M27387 (F) GAGVSQSPSNKVTEKGKDVELRCDPISGHTALYWYRQSLGQGLEFLIYFQGNSAPDKSGLPSDRFSAERTGGSVSTLTIQRTQQEDSAVYLCASSL
97
+ Homosap TRBV7-3 TRBV7-3*01 X61440 F GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFLIYFQGTGAADDSGLPNDRFFAVRPEGSVSTLKIQRTERGDSAVYLCASSL
98
+ Homosap TRBV7-3 TRBV7-3*04 X74843 (F) GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFLIYFQGTGAADDSGLPNDRFFAVRPEGSVSTLKIQRTERGDSAVYLCASS
99
+ Homosap TRBV7-4 TRBV7-4*01 L36092 F GAGVSQSPRYKVAKRGRDVALRCDSISGHVTLYWYRQTLGQGSEVLTYSQSDAQRDKSGRPSGRFSAERPERSVSTLKIQRTEQGDSAVYLCASSL
100
+ Homosap TRBV7-6 TRBV7-6*01 L36092 F GAGVSQSPRYKVTKRGQDVALRCDPISGHVSLYWYRQALGQGPEFLTYFNYEAQQDKSGLPNDRFSAERPEGSISTLTIQRTEQRDSAMYRCASSL
101
+ Homosap TRBV7-6 TRBV7-6*02 X58806 (F) GAGVSQSPRYKVTKRGQDVALRCDPISGHVSLYWYRQALGQGPEFLTYFNYEAQQDKSGLPNDRFSAERPEGSISTLTIQRTEQRDSAMYRCASS
102
+ Homosap TRBV7-7 TRBV7-7*01 L36092 F GAGVSQSPRYKVTKRGQDVTLRCDPISSHATLYWYQQALGQGPEFLTYFNYEAQPDKSGLPSDRFSAERPEGSISTLTIQRTEQRDSAMYRCASSL
103
+ Homosap TRBV7-7 TRBV7-7*02 X57607 (F) GAGVSQSPRYKVTKRGQDVTLRCDPISSHVTLYWYQQALGQGPEFLTYFNYEAQPDKSGLPSDRFSAERPEGSISTLTIQRTEQRDSAMYRCASS
104
+ Homosap TRBV7-8 TRBV7-8*01 M11953 F GAGVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTYFQNEAQLDKSGLPSDRFFAERPEGSVSTLKIQRTQQEDSAVYLCASSL
105
+ Homosap TRBV7-8 TRBV7-8*02 X61441 F GAGVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTYFQNEAQLDKSGLPSDRFFAERPEGSVSTLKIQRTQKEDSAVYLCASSL
106
+ Homosap TRBV7-8 TRBV7-8*03 M27384 (F) GAGVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTYFQNEAQLDKSGLPSDRFFAERPEGSVSTLKIQRTQQEDSAVYLCASSR
107
+ Homosap TRBV7-9 TRBV7-9*01 L36092 F DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSL
108
+ Homosap TRBV7-9 TRBV7-9*02 M15564 (F) DTGVSQNPRHNITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSL
109
+ Homosap TRBV7-9 TRBV7-9*03 AF009663 F DTGVSQDPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSL
110
+ Homosap TRBV7-9 TRBV7-9*04 M14261 (F) ISGVSHNPRHKITKRGQNVTFRCDPISEHNRLYWYRQNPGQGPEFLTYFQNEAQLEKSGLLSDRISAERPKGSFSTLEIQRTEQGDSAMYLCASS
111
+ Homosap TRBV7-9 TRBV7-9*05 M27385 (F) DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSLSTLEIQRTEQGDSAMYLCASTK
112
+ Homosap TRBV7-9 TRBV7-9*06 X74844 (F) DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSLSTLEIQRTEQGDSAMYLCASTL
113
+ Homosap TRBV9 TRBV9*01 L36092 F DSGVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQYYNGEERAKGNILERFSAQQFPDLHSELNLSSLELGDSALYFCASSV
114
+ Homosap TRBV9 TRBV9*02 AF009660 F DSGVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIHYYNGEERAKGNILERFSAQQFPDLHSELNLSSLELGDSALYFCASSV
115
+ Homosap TRBV9 TRBV9*03 M27380 (F) DSGVTQTPKHLITATGQRVTLRCSPRSGDLSVYWYQQSLDQGLQFLIQYYNGEERAKGNILERFSAQQFPDLHSELNLSSLELGDSALYFCASS
src/library/trbvs_nt.tsv ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Species Gene Allele AccNum Functionality nt_seq
2
+ Homosap TRBV10-1 TRBV10-1*01 L36092 F gatgctgaaatcacccagagcccaagacacaagatcacagagacaggaaggcaggtgaccttggcgtgtcaccagacttggaaccacaacaatatgttctggtatcgacaagacctgggacatgggctgaggctgatccattactcatatggtgttcaagacactaacaaaggagaagtctcagatggctacagtgtctctagatcaaacacagaggacctccccctcactctggagtctgctgcctcctcccagacatctgtatatttctgcgccagcagtgagtc
3
+ Homosap TRBV10-1 TRBV10-1*02 AF009660 F gatgctgaaatcacccagagcccaagacacaagatcacagagacaggaaggcaggtgaccttggcgtgtcaccagacttggaaccacaacaatatgttctggtatcgacaagacctgggacatgggctgaggctgatccattactcatatggtgttcacgacactaacaaaggagaagtctcagatggctacagtgtctctagatcaaacacagaggacctccccctcactctggagtctgctgcctcctcccagacatctgtatatttctgcgccagcagtgagtc
4
+ Homosap TRBV10-2 TRBV10-2*01 L36092 F gatgctggaatcacccagagcccaagatacaagatcacagagacaggaaggcaggtgaccttgatgtgtcaccagacttggagccacagctatatgttctggtatcgacaagacctgggacatgggctgaggctgatctattactcagcagctgctgatattacagataaaggagaagtccccgatggctatgttgtctccagatccaagacagagaatttccccctcactctggagtcagctacccgctcccagacatctgtgtatttctgcgccagcagtgagtc
5
+ Homosap TRBV10-2 TRBV10-2*02 IMGT000021 F gatgctggaatcacccagagcccaagatacaagatcacagagacaggaaggcaggtgaccttgatgtgtcaccagacttggagccacagctatatgttctggtatcgacaagacctgggacatgggctgaggctgatctattactcagcagctgctgatattacagataaaggagaagtccccgatggctacgttgtctccagatccaagacagagaatttccccctcactctggagtcagctacccgctcccagacatctgtgtatttctgcgccagcagtgagtc
6
+ Homosap TRBV10-3 TRBV10-3*01 U03115 F gatgctggaatcacccagagcccaagacacaaggtcacagagacaggaacaccagtgactctgagatgtcaccagactgagaaccaccgctatatgtactggtatcgacaagacccggggcatgggctgaggctgatccattactcatatggtgttaaagatactgacaaaggagaagtctcagatggctatagtgtctctagatcaaagacagaggatttcctcctcactctggagtccgctaccagctcccagacatctgtgtacttctgtgccatcagtgagtc
7
+ Homosap TRBV10-3 TRBV10-3*02 U17047 F gatgctggaatcacccagagcccaagacacaaggtcacagagacaggaacaccagtgactctgagatgtcatcagactgagaaccaccgctatatgtactggtatcgacaagacccggggcatgggctgaggctgatccattactcatatggtgttaaagatactgacaaaggagaagtctcagatggctatagtgtctctagatcaaagacagaggatttcctcctcactctggagtccgctaccagctcccagacatctgtgtacttctgtgccatcagtgagtc
8
+ Homosap TRBV10-3 TRBV10-3*03 L33101 [F] gatgctggaatcacccagagcccaagacacaaggtcacagagacaggaacaccagtgactctgagatgtcaccagactgagaaccaccgctacatgtactggtatcgacaagacccggggcatgggctgaggctaatccattactcatatggtgttaaagatactgacaaaggagaagtctcagatggctatagtgtctctagatcaaagacagaggatttcctcctcactctggagtccgctaccagctcccagacatctgtgtacttctgt
9
+ Homosap TRBV10-3 TRBV10-3*04 L33102 [F] gatgctggaatcacccagagcccaagacacaaggtcacagagacaggaacaccagtgactctgagatgtcaccagactgagaaccaccgctacatgtactggtatcgacaagacccggggcatgggctgaggctgatccattactcatatggtgttaaagatactgacaaaggagaagtctcagatggctatagtgtctctagatcaaagacagaggatttcctcctcactctggagtccgctaccagctcccagacatctgtgtacttctgt
10
+ Homosap TRBV11-1 TRBV11-1*01 M33233 F gaagctgaagttgcccagtcccccagatataagattacagagaaaagccaggctgtggctttttggtgtgatcctatttctggccatgctaccctttactggtaccggcagatcctgggacagggcccggagcttctggttcaatttcaggatgagagtgtagtagatgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcagagcttggggactcggccatgtatctctgtgccagcagcttagc
11
+ Homosap TRBV11-2 TRBV11-2*01 L36092 F gaagctggagttgcccagtctcccagatataagattatagagaaaaggcagagtgtggctttttggtgcaatcctatatctggccatgctaccctttactggtaccagcagatcctgggacagggcccaaagcttctgattcagtttcagaataacggtgtagtggatgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcaaagcttgaggactcggccgtgtatctctgtgccagcagcttaga
12
+ Homosap TRBV11-2 TRBV11-2*02 M33235 [F] gaagctggagttgcccagtctcccagatataagattatagagaaaaggcagagtgtggctttttggtgcaatcctatatctggccatgctaccctttactggtaccagcagatcctgggacagggcccaaagcttctgattcagtttcagaataacggtgtagtggatgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcaaagcttgagaactcggccgtgtatctctgtgccagcagt
13
+ Homosap TRBV11-2 TRBV11-2*03 IMGT000021 F gaagctggagttgcccagtctcccagatataagattatagagaaaaggcagagtgtggctttttggtgcaatcctatatctggccatgctaccctttactggtaccagcagatcctgggacagggcccaaagcttctgattcagtttcagaataacggtgtagtggatgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccaacctgcaaagcttgaggactcggccgtgtatctctgtgccagcagcttaga
14
+ Homosap TRBV11-3 TRBV11-3*01 U03115 F gaagctggagtggttcagtctcccagatataagattatagagaaaaaacagcctgtggctttttggtgcaatcctatttctggccacaataccctttactggtacctgcagaacttgggacagggcccggagcttctgattcgatatgagaatgaggaagcagtagacgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcagagcttggggactcggccgtgtatctctgtgccagcagcttaga
15
+ Homosap TRBV11-3 TRBV11-3*02 X58797 (F) gaagctggagtggttcagtctcccagatataagattatagagaaaaagcagcctgtggctttttggtgcaatcctatttctggccacaataccctttactggtaccggcagaacttgggacagggcccggagcttctgattcgatatgagaatgaggaagcagtagacgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcagagcttggggactcggccgtgtatctctgtgccagcagc
16
+ Homosap TRBV11-3 TRBV11-3*04 AB305924 (F) gaagctggagtggttcagtctcccagatataagattatagagaaaaaacagcctgtggctttttggtgcaatcctatttctggccacaataccctttactggtaccggcagaacttgggacagggcccggagcttctgattcgatatgagaatgaggaagcagtagacgattcacagttgcctaaggatcgattttctgcagagaggctcaaaggagtagactccactctcaagatccagcctgcagagcttggggactcggccgtgtatctctgtgccagcagcttag
17
+ Homosap TRBV12-3 TRBV12-3*01 X07192 F gatgctggagttatccagtcaccccgccatgaggtgacagagatgggacaagaagtgactctgagatgtaaaccaatttcaggccacaactcccttttctggtacagacagaccatgatgcggggactggagttgctcatttactttaacaacaacgttccgatagatgattcagggatgcccgaggatcgattctcagctaagatgcctaatgcatcattctccactctgaagatccagccctcagaacccagggactcagctgtgtacttctgtgccagcagtttagc
18
+ Homosap TRBV12-4 TRBV12-4*01 K02546 F gatgctggagttatccagtcaccccggcacgaggtgacagagatgggacaagaagtgactctgagatgtaaaccaatttcaggacacgactaccttttctggtacagacagaccatgatgcggggactggagttgctcatttactttaacaacaacgttccgatagatgattcagggatgcccgaggatcgattctcagctaagatgcctaatgcatcattctccactctgaagatccagccctcagaacccagggactcagctgtgtacttctgtgccagcagtttagc
19
+ Homosap TRBV12-4 TRBV12-4*02 M14264 (F) gatgctggagttatccagtcaccccggcacgaggtgacagagatgggacaagaagtgactctgagatgtaaaccaatttcaggacatgactaccttttctggtacagacagaccatgatgcggggactggagttgctcatttactttaacaacaacgttccgatagatgattcagggatgcccgaggatcgattctcagctaagatgcctaatgcatcattctccactctgaggatccagccctcagaacccagggactcagctgtgtacttctgtgccagcagttta
20
+ Homosap TRBV12-5 TRBV12-5*01 X07223 F gatgctagagtcacccagacaccaaggcacaaggtgacagagatgggacaagaagtaacaatgagatgtcagccaattttaggccacaatactgttttctggtacagacagaccatgatgcaaggactggagttgctggcttacttccgcaaccgggctcctctagatgattcggggatgccgaaggatcgattctcagcagagatgcctgatgcaactttagccactctgaagatccagccctcagaacccagggactcagctgtgtatttttgtgctagtggtttggt
21
+ Homosap TRBV13 TRBV13*01 U03115 F gctgctggagtcatccagtccccaagacatctgatcaaagaaaagagggaaacagccactctgaaatgctatcctatccctagacacgacactgtctactggtaccagcagggtccaggtcaggacccccagttcctcatttcgttttatgaaaagatgcagagcgataaaggaagcatccctgatcgattctcagctcaacagttcagtgactatcattctgaactgaacatgagctccttggagctgggggactcagccctgtacttctgtgccagcagcttagg
22
+ Homosap TRBV13 TRBV13*02 M62378 (F) gctgctggagtcatccagtccccaagacatctgatcagagaaaagagggaaacagccactctgaaatgctatcctatccctagacacgacactgtctactggtaccagcagggcccaggtcaggacccccagttcttcatttcgttttatgaaaagatgcagagcgataaaggaagcatccctgatcgattctcagctcaacagttcagtgactatcattctgaactgaacatgagctccttggagctgggggactcagccctgtacttctgtgccagcagc
23
+ Homosap TRBV14 TRBV14*01 X06154 F gaagctggagttactcagttccccagccacagcgtaatagagaagggccagactgtgactctgagatgtgacccaatttctggacatgataatctttattggtatcgacgtgttatgggaaaagaaataaaatttctgttacattttgtgaaagagtctaaacaggatgagtccggtatgcccaacaatcgattcttagctgaaaggactggagggacgtattctactctgaaggtgcagcctgcagaactggaggattctggagtttatttctgtgccagcagccaaga
24
+ Homosap TRBV14 TRBV14*02 X57722 (F) gaagctggagttactcagttccccagccacagcgtaatagagaagggccagactgtgactctgagatgtgacccaatttctggacatgataatctttattggtatcgacgtgttatgggaaaagaaataaaatttctgttacattttgtgaaagagtctaaacaggatgaatccggtatgcccaacaatcgattcttagctgaaaggactggagggacgtattctactctgaaggtgcagcctgcagaactggaggattctggagtttatttctgtgccagcagc
25
+ Homosap TRBV15 TRBV15*01 U03115 F gatgccatggtcatccagaacccaagataccaggttacccagtttggaaagccagtgaccctgagttgttctcagactttgaaccataacgtcatgtactggtaccagcagaagtcaagtcaggccccaaagctgctgttccactactatgacaaagattttaacaatgaagcagacacccctgataacttccaatccaggaggccgaacacttctttctgctttcttgacatccgctcaccaggcctgggggacacagccatgtacctgtgtgccaccagcagaga
26
+ Homosap TRBV15 TRBV15*02 IMGT000021 F gatgccatggtcatccagaacccaagataccaggttacccagtttggaaagccagtgaccctgagttgttctcagactttgaaccataacgtcatgtactggtaccagcagaagtcaagtcaggccccaaagctgctgttccactactatgacaaagattttaacaatgaagcagacacccctgataacttccaatccaggaggccgaacacttctttctgctttcttgacatccgctcaccaggcctgggggacgcagccatgtacctgtgtgccaccagcagaga
27
+ Homosap TRBV15 TRBV15*03 M62376 (F) gatgccatggtcatccagaacccaagataccgggttacccagtttggaaagccagtgaccctgagttgttctcagactttgaaccataacgtcatgtactggtaccagcagaagtcaagtcaggccccaaagctgctgttccactactataacaaagattttaacaatgaagcagacacccctgataacttccaatccaggaggccgaacacttctttctgctttctagacatccgctcaccaggcctgggggacgcagccatgtaccagtgtgccaccagc
28
+ Homosap TRBV16 TRBV16*01 L26231 F ggtgaagaagtcgcccagactccaaaacatcttgtcagaggggaaggacagaaagcaaaattatattgtgccccaataaaaggacacagttatgttttttggtaccaacaggtcctgaaaaacgagttcaagttcttgatttccttccagaatgaaaatgtctttgatgaaacaggtatgcccaaggaaagattttcagctaagtgcctcccaaattcaccctgtagccttgagatccaggctacgaagcttgaggattcagcagtgtatttttgtgccagcagccaatc
29
+ Homosap TRBV16 TRBV16*03 L26054 (F) ggtgaagaagtcgcccagactccaaaacatcttgtcagaggggaaggacagaaagcaaaattatattgtgccccaataaaaggacacagttatgttttttggtaccaacaggtcctgaaaaacgagttcaagttcttggtttccttccagaatgaaaatgtctttgatgaaacaggtatgcccaaggaaagattttcagctaagtgcctcccaaattcaccctgtagccttgagatccaggctacgaagcttgaggattcagcagtgtatttttgtgccagcagc
30
+ Homosap TRBV18 TRBV18*01 L36092 F aatgccggcgtcatgcagaacccaagacacctggtcaggaggaggggacaggaggcaagactgagatgcagcccaatgaaaggacacagtcatgtttactggtatcggcagctcccagaggaaggtctgaaattcatggtttatctccagaaagaaaatatcatagatgagtcaggaatgccaaaggaacgattttctgctgaatttcccaaagagggccccagcatcctgaggatccagcaggtagtgcgaggagattcggcagcttatttctgtgccagctcaccacc
31
+ Homosap TRBV19 TRBV19*01 L36092 F gatggtggaatcactcagtccccaaagtacctgttcagaaaggaaggacagaatgtgaccctgagttgtgaacagaatttgaaccacgatgccatgtactggtaccgacaggacccagggcaagggctgagattgatctactactcacagatagtaaatgactttcagaaaggagatatagctgaagggtacagcgtctctcgggagaagaaggaatcctttcctctcactgtgacatcggcccaaaagaacccgacagctttctatctctgtgccagtagtataga
32
+ Homosap TRBV19 TRBV19*02 U48259 F gatggtggaatcactcagtccccaaagtacctgttcagaaaggaaggacagaatgtgaccctgagttgtgaacagaatttgaaccacgatgccatgtactggtaccgacaggtcccagggcaagggctgagattgatctactactcacacatagtaaatgactttcagaaaggagatatagctgaagggtacagcgtctctcgggagaagaaggaatcctttcctctcactgtgacatcggcccaaaagaacccgacagctttctatctctgtgccagtagtataga
33
+ Homosap TRBV19 TRBV19*03 M97725 (F) gatggtggaatcactcagtccccaaagtacctgttcagaaaggaaggacagaatgtgaccctgagttgtgaacagaatttgaaccacgatgccatgtactggtaccgacaggacccagggcaagggctgagattgatctactactcacacatagtaaatgactttcagaaaggagatatagctgaagggtacagcgtctctcgggagaagaaggaatcctttcctctcactgtgacatcggcccaaaagaacccgacagctttctatctctgtgccagtag
34
+ Homosap TRBV2 TRBV2*01 L36092 F gaacctgaagtcacccagactcccagccatcaggtcacacagatgggacaggaagtgatcttgcgctgtgtccccatctctaatcacttatacttctattggtacagacaaatcttggggcagaaagtcgagtttctggtttccttttataataatgaaatctcagagaagtctgaaatattcgatgatcaattctcagttgaaaggcctgatggatcaaatttcactctgaagatccggtccacaaagctggaggactcagccatgtacttctgtgccagcagtgaagc
35
+ Homosap TRBV2 TRBV2*02 M62379 (F) gaacctgaagtcacccagactcccagccatcaggtcacacagatgggacaggaagtgatcttgcactgtgtccccatctctaatcacttatacttctattggtacagacaaatcttggggcagaaagtcgagtttctggtttccttttataataatgaaatctcagagaagtctgaaatattcgatgatcaattctcagttgaaaggcctgatggatcaaatttcactctgaagatccggtccacaaagctggaggactcagccatgtacttctgtgccagcagt
36
+ Homosap TRBV2 TRBV2*03 M64351 (F) gaacctgaagtcacccagactcccagccatcaggtcacacagatgggacaggaagtgatcttgcgctgtgtccccatctctaatcacttatacttctattggtacagacaaatcttggggcagaaagtcgagtttctggtttccttttataataatgaaatctcagagaagtctgaaatattcgatgatcaattctcagttgagaggcctgatggatcaaatttcactctgaagatccggtccacaaagctggaggactcagccatgtacttctgtgccagcagtgaa
37
+ Homosap TRBV20-1 TRBV20-1*01 M11955 F ggtgctgtcgtctctcaacatccgagctgggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaacagagtctcatgctgatggcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgctagaga
38
+ Homosap TRBV20-1 TRBV20-1*02 X72719 F ggtgctgtcgtctctcaacatccgagcagggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaacagagtctcatgctgatggcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgct
39
+ Homosap TRBV20-1 TRBV20-1*03 M11954 (F) ggtgctgtcgtctctcaacatccgagctgggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaacagagtctcatgctgatggcaacttccaatgagggctgcaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgct
40
+ Homosap TRBV20-1 TRBV20-1*04 M14263 (F) ggtgctgtcgtctctcaacatccgagcagggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccttggactttcaggccacaactatgttttggtatcgtcagttcccgaaaaagagtctcatgctgatggcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgctagt
41
+ Homosap TRBV20-1 TRBV20-1*05 X57604 (F) ggtgctgtcgtctctcaacatccgagcagggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaaaagagtctcatgctgatggcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgctaga
42
+ Homosap TRBV20-1 TRBV20-1*06 D13088 (F) ggtgctgtcgtctctcaacatccgagtagggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaaaagagtctcatgctgatggcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgct
43
+ Homosap TRBV20-1 TRBV20-1*07 X74852 (F) ggtgctgtcgtctctcaacatccgagcagggttatctgtaagagtggaacctctgtgaagatcgagtgccgttccctggactttcaggccacaactatgttttggtatcgtcagttcccgaaaaagagtctcatgcagatcgcaacttccaatgagggctccaaggccacatacgagcaaggcgtcgagaaggacaagtttctcatcaaccatgcaagcctgaccttgtccactctgacagtgaccagtgcccatcctgaagacagcagcttctacatctgcagtgctaga
44
+ Homosap TRBV24-1 TRBV24-1*01 M11951 F gatgctgatgttacccagaccccaaggaataggatcacaaagacaggaaagaggattatgctggaatgttctcagactaagggtcatgatagaatgtactggtatcgacaagacccaggactgggcctacggttgatctattactcctttgatgtcaaagatataaacaaaggagagatctctgatggatacagtgtctctcgacaggcacaggctaaattctccctgtccctagagtctgccatccccaaccagacagctctttacttctgtgccaccagtgatttg
45
+ Homosap TRBV24-1 TRBV24-1*02 IMGT000021 F gatgctgatgttacccagaccccaaggaataggatcacaaagacaggaaagaggattatgctggaatgttctcagactaagggtcatgatagaatgtactggtatcgacaagacccaggactgggcctacagttgatctattactcctttgatgtcaaagatataaacaaaggagagatctctgatggatacagtgtctctcgacaggcacaggctaaattctccctgtccctagagtctgccatccccaaccagacagctctttacttctgtgccaccagtgatttg
46
+ Homosap TRBV25-1 TRBV25-1*01 L36092 F gaagctgacatctaccagaccccaagataccttgttatagggacaggaaagaagatcactctggaatgttctcaaaccatgggccatgacaaaatgtactggtatcaacaagatccaggaatggaactacacctcatccactattcctatggagttaattccacagagaagggagatctttcctctgagtcaacagtctccagaataaggacggagcattttcccctgaccctggagtctgccaggccctcacatacctctcagtacctctgtgccagcagtgaata
47
+ Homosap TRBV27 TRBV27*01 L36092 F gaagcccaagtgacccagaacccaagatacctcatcacagtgactggaaagaagttaacagtgacttgttctcagaatatgaaccatgagtatatgtcctggtatcgacaagacccagggctgggcttaaggcagatctactattcaatgaatgttgaggtgactgataagggagatgttcctgaagggtacaaagtctctcgaaaagagaagaggaatttccccctgatcctggagtcgcccagccccaaccagacctctctgtacttctgtgccagcagtttatc
48
+ Homosap TRBV28 TRBV28*01 U08314 F gatgtgaaagtaacccagagctcgagatatctagtcaaaaggacgggagagaaagtttttctggaatgtgtccaggatatggaccatgaaaatatgttctggtatcgacaagacccaggtctggggctacggctgatctatttctcatatgatgttaaaatgaaagaaaaaggagatattcctgaggggtacagtgtctctagagagaagaaggagcgcttctccctgattctggagtccgccagcaccaaccagacatctatgtacctctgtgccagcagtttatg
49
+ Homosap TRBV29-1 TRBV29-1*01 L36092 F agtgctgtcatctctcaaaagccaagcagggatatctgtcaacgtggaacctccctgacgatccagtgtcaagtcgatagccaagtcaccatgatgttctggtaccgtcagcaacctggacagagcctgacactgatcgcaactgcaaatcagggctctgaggccacatatgagagtggatttgtcattgacaagtttcccatcagccgcccaaacctaacattctcaactctgactgtgagcaacatgagccctgaagacagcagcatatatctctgcagcgttgaaga
50
+ Homosap TRBV29-1 TRBV29-1*02 M13847 (F) agtgctgtcatctctcaaaagccaagcagggatatctgtcaacgtggaacctccctgacgatccagtgtcaagtcgatagccaagtcaccatgatgttctggtaccgtcagcaacctggacagagcctgacactgatcgcaactgcaaatcagggctctgaggccacatatgagagtggatttgtcattgacaagtttcccatcagccgcccaaacctaacattctcaagtctgactgtgagcaacatgagccctgaagacagcagcatatatctctgcagcgttgaa
51
+ Homosap TRBV3-1 TRBV3-1*01 U07977 F gacacagctgtttcccagactccaaaatacctggtcacacagatgggaaacgacaagtccattaaatgtgaacaaaatctgggccatgatactatgtattggtataaacaggactctaagaaatttctgaagataatgtttagctacaataataaggagctcattataaatgaaacagttccaaatcgcttctcacctaaatctccagacaaagctcacttaaatcttcacatcaattccctggagcttggtgactctgctgtgtatttctgtgccagcagccaaga
52
+ Homosap TRBV3-1 TRBV3-1*02 L06889 (F) gacacagctgtttcccagactccaaaatacctggtcacacagatgggaaacgacaagtccattaaatgtgaacaaaatctgggccatgatactatgtattggtataaacaggactctaagaaatttctgaagataatgtttagctacaataacaaggagatcattataaatgaaacagttccaaatcgattctcacctaaatctccagacaaagctaaattaaatcttcacatcaattccctggagcttggtgactctgctgtgtatttctgtgccagc
53
+ Homosap TRBV30 TRBV30*01 L36092 F tctcagactattcatcaatggccagcgaccctggtgcagcctgtgggcagcccgctctctctggagtgcactgtggagggaacatcaaaccccaacctatactggtaccgacaggctgcaggcaggggcctccagctgctcttctactccgttggtattggccagatcagctctgaggtgccccagaatctctcagcctccagaccccaggaccggcagttcatcctgagttctaagaagctccttctcagtgactctggcttctatctctgtgcctggagtgt
54
+ Homosap TRBV30 TRBV30*02 Z13967 F tctcagactattcatcaatggccagcgaccctggtgcagcctgtgggcagcccgctctctctggagtgcactgtggagggaacatcaaaccccaacctatactggtaccgacaggctgcaggcaggggcctccagctgctcttctactccgttggtattggccagatcagctctgaggtgccccagaatctctcagcctccagaccccaggaccggcagttcatcctgagttctaagaagctcctcctcagtgactctggcttctatctctgtgcctggagtgt
55
+ Homosap TRBV30 TRBV30*05 L06893 (F) tctcagactattcatcaatggccagcgaccctggtgcagcctgtgggcagcccgctctccctggagtgcactgtggagggaacatcaaaccccaacctatactggtaccgacaggctgcaggacggggcctccagctgctcttctactccgttggtattggccagatcagctctgaggtgccccagaatctctcagcctccagaccccaggaccggcagttcatcctgagttctaagaagctccttctcagtgactctggcttctatctctgtgcctgggga
56
+ Homosap TRBV4-1 TRBV4-1*01 U07977 F gacactgaagttacccagacaccaaaacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatatggggcacagggctatgtattggtacaagcagaaagctaagaagccaccggagctcatgtttgtctacagctatgagaaactctctataaatgaaagtgtgccaagtcgcttctcacctgaatgccccaacagctctctcttaaaccttcacctacacgccctgcagccagaagactcagccctgtatctctgcgccagcagccaaga
57
+ Homosap TRBV4-2 TRBV4-2*01 U07975 F gaaacgggagttacgcagacaccaagacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatctggggcataacgctatgtattggtacaagcaaagtgctaagaagccactggagctcatgtttgtctacaactttaaagaacagactgaaaacaacagtgtgccaagtcgcttctcacctgaatgccccaacagctctcacttattccttcacctacacaccctgcagccagaagactcggccctgtatctctgtgccagcagccaaga
58
+ Homosap TRBV4-2 TRBV4-2*02 X58811 (F) gaaacgggagttacgcagacaccaagacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatctggggcataacgctatgtattggtacaagcaaagtgctaagaagccactggagctcatgtttgtctacaactttaaagaacagactgaaaacaacagtgtgccaagtcgcttctcacctgaatgccccaacagctctcacttatgccttcacctacacaccctgcagccagaagactcggccctgtatctctgtgccagcacc
59
+ Homosap TRBV4-3 TRBV4-3*01 U07978 F gaaacgggagttacgcagacaccaagacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatctgggtcataacgctatgtattggtacaagcaaagtgctaagaagccactggagctcatgtttgtctacagtcttgaagaacgggttgaaaacaacagtgtgccaagtcgcttctcacctgaatgccccaacagctctcacttattccttcacctacacaccctgcagccagaagactcggccctgtatctctgcgccagcagccaaga
60
+ Homosap TRBV4-3 TRBV4-3*02 X58812 (F) gaaacgggagttacgcagacaccaagacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatctgggtcataacgctatgtattggtacaagcaaagtgctaagaagccactggagctcatgtttgtctacagtcttgaagaacgggttgaaaacaacagtgtgccaagtcgcttctcacctgaatgccccaacagctctcacttatcccttcacctacacaccctgcagccagaagactcggccctgtatctctgcgccagcagc
61
+ Homosap TRBV4-3 TRBV4-3*03 L06888 (F) gaaacgggagttacgcagacaccaagacacctggtcatgggaatgacaaataagaagtctttgaaatgtgaacaacatctgggtcataacgctatgtattggtacaagcaaagtgctaagaagccactggagctcatgtttgtctacagtcttgaagaacgtgttgaaaacaacagtgtgccaagtcgcttctcacctgaatgccccaacagctctcacttattccttcacctacacaccctgcagccagaagactcggccctgtatctctgcgccagcagc
62
+ Homosap TRBV5-1 TRBV5-1*01 L36092 F aaggctggagtcactcaaactccaagatatctgatcaaaacgagaggacagcaagtgacactgagctgctcccctatctctgggcataggagtgtatcctggtaccaacagaccccaggacagggccttcagttcctctttgaatacttcagtgagacacagagaaacaaaggaaacttccctggtcgattctcagggcgccagttctctaactctcgctctgagatgaatgtgagcaccttggagctgggggactcggccctttatctttgcgccagcagcttgg
63
+ Homosap TRBV5-1 TRBV5-1*02 M14271 (F) agggctggggtcactcaaactccaagacatctgatcaaaacgagaggacagcaagtgacactgggctgctcccctatctctgggcataggagtgtatcctggtaccaacagaccctaggacagggccttcagttcctctttgaatacttcagtgagacacagagaaacaaaggaaacttccttggtcgattctcagggcgccagttctctaactctcgctctgagatgaatgtgagcaccttggagctgggggactcggccctttatctttgcgccagc
64
+ Homosap TRBV5-4 TRBV5-4*01 L36092 F gagactggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcaagtgactctgagatgctcttctcagtctgggcacaacactgtgtcctggtaccaacaggccctgggtcaggggccccagtttatctttcagtattatagggaggaagagaatggcagaggaaacttccctcctagattctcaggtctccagttccctaattatagctctgagctgaatgtgaacgccttggagctggacgactcggccctgtatctctgtgccagcagcttgg
65
+ Homosap TRBV5-4 TRBV5-4*02 X57615 (F) gagactggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcaagtgactctgagatgctcttctcagtctgggcacaacactgtgtcctggtaccaacaggccctgggtcaggggccccagtttatctttcagtattatagggaggaagagaatggcagaggaaacttccctcctagattctcaggtctccagttccctaattataactctgagctgaatgtgaacgccttggagctggacgactcggccctgtatctctgtgccagcagc
66
+ Homosap TRBV5-5 TRBV5-5*01 L36092 F gacgctggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcaagtgactctgagatgctctcctatctctgggcacaagagtgtgtcctggtaccaacaggtcctgggtcaggggccccagtttatctttcagtattatgagaaagaagagagaggaagaggaaacttccctgatcgattctcagctcgccagttccctaactatagctctgagctgaatgtgaacgccttgttgctgggggactcggccctgtatctctgtgccagcagcttgg
67
+ Homosap TRBV5-5 TRBV5-5*02 X57611 (F) gacgctggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcacgtgactctgagatgctctcctatctctgggcacaagagtgtgtcctggtaccaacaggtcctgggtcaggggccccagtttatctttcagtattatgagaaagaagagagaggaagaggaaacttccctgatcgattctcagctcgccagttccctaactatagctctgagctgaatgtgaacgccttgttgctgggggactcggccctgtatctctgtgccagcagc
68
+ Homosap TRBV5-5 TRBV5-5*03 X58801 (F) gacgctggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcaagtgactctgagatgctctcctatctctgagcacaagagtgtgtcctggtaccaacaggtcctgggtcaggggccccagtttatctttcagtattatgagaaagaagagagaggaagaggaaacttccctgatcgattctcagctcgccagttccctaactatagctctgagctgaatgtgaacgccttgttgctgggggactcggccctgtatctctgtgccagcagc
69
+ Homosap TRBV5-6 TRBV5-6*01 L36092 F gacgctggagtcacccaaagtcccacacacctgatcaaaacgagaggacagcaagtgactctgagatgctctcctaagtctgggcatgacactgtgtcctggtaccaacaggccctgggtcaggggccccagtttatctttcagtattatgaggaggaagagagacagagaggcaacttccctgatcgattctcaggtcaccagttccctaactatagctctgagctgaatgtgaacgccttgttgctgggggactcggccctctatctctgtgccagcagcttgg
70
+ Homosap TRBV5-8 TRBV5-8*01 L36092 F gaggctggagtcacacaaagtcccacacacctgatcaaaacgagaggacagcaagcgactctgagatgctctcctatctctgggcacaccagtgtgtactggtaccaacaggccctgggtctgggcctccagttcctcctttggtatgacgagggtgaagagagaaacagaggaaacttccctcctagattttcaggtcgccagttccctaattatagctctgagctgaatgtgaacgccttggagctggaggactcggccctgtatctctgtgccagcagcttgg
71
+ Homosap TRBV6-1 TRBV6-1*01 X61446 F aatgctggtgtcactcagaccccaaaattccaggtcctgaagacaggacagagcatgacactgcagtgtgcccaggatatgaaccataactccatgtactggtatcgacaagacccaggcatgggactgaggctgatttattactcagcttctgagggtaccactgacaaaggagaagtccccaatggctacaatgtctccagattaaacaaacgggagttctcgctcaggctggagtcggctgctccctcccagacatctgtgtacttctgtgccagcagtgaagc
72
+ Homosap TRBV6-2 TRBV6-2*01 X61445 F aatgctggtgtcactcagaccccaaaattccgggtcctgaagacaggacagagcatgacactgctgtgtgcccaggatatgaaccatgaatacatgtactggtatcgacaagacccaggcatggggctgaggctgattcattactcagttggtgagggtacaactgccaaaggagaggtccctgatggctacaatgtctccagattaaaaaaacagaatttcctgctggggttggagtcggctgctccctcccaaacatctgtgtacttctgtgccagcagttactc
73
+ Homosap TRBV6-3 TRBV6-3*01 U07978 F aatgctggtgtcactcagaccccaaaattccgggtcctgaagacaggacagagcatgacactgctgtgtgcccaggatatgaaccatgaatacatgtactggtatcgacaagacccaggcatggggctgaggctgattcattactcagttggtgagggtacaactgccaaaggagaggtccctgatggctacaatgtctccagattaaaaaaacagaatttcctgctggggttggagtcggctgctccctcccaaacatctgtgtacttctgtgccagcagttactc
74
+ Homosap TRBV6-4 TRBV6-4*01 X61653 F attgctgggatcacccaggcaccaacatctcagatcctggcagcaggacggcgcatgacactgagatgtacccaggatatgagacataatgccatgtactggtatagacaagatctaggactggggctaaggctcatccattattcaaatactgcaggtaccactggcaaaggagaagtccctgatggttatagtgtctccagagcaaacacagatgatttccccctcacgttggcgtctgctgtaccctctcagacatctgtgtacttctgtgccagcagtgactc
75
+ Homosap TRBV6-4 TRBV6-4*02 AF009660 F actgctgggatcacccaggcaccaacatctcagatcctggcagcaggacggagcatgacactgagatgtacccaggatatgagacataatgccatgtactggtatagacaagatctaggactggggctaaggctcatccattattcaaatactgcaggtaccactggcaaaggagaagtccctgatggttatagtgtctccagagcaaacacagatgatttccccctcacgttggcgtctgctgtaccctctcagacatctgtgtacttctgtgccagcagtgactc
76
+ Homosap TRBV6-5 TRBV6-5*01 L36092 F aatgctggtgtcactcagaccccaaaattccaggtcctgaagacaggacagagcatgacactgcagtgtgcccaggatatgaaccatgaatacatgtcctggtatcgacaagacccaggcatggggctgaggctgattcattactcagttggtgctggtatcactgaccaaggagaagtccccaatggctacaatgtctccagatcaaccacagaggatttcccgctcaggctgctgtcggctgctccctcccagacatctgtgtacttctgtgccagcagttactc
77
+ Homosap TRBV6-6 TRBV6-6*01 L36092 F aatgctggtgtcactcagaccccaaaattccgcatcctgaagataggacagagcatgacactgcagtgtacccaggatatgaaccataactacatgtactggtatcgacaagacccaggcatggggctgaagctgatttattattcagttggtgctggtatcactgataaaggagaagtcccgaatggctacaacgtctccagatcaaccacagaggatttcccgctcaggctggagttggctgctccctcccagacatctgtgtacttctgtgccagcagttactc
78
+ Homosap TRBV6-6 TRBV6-6*02 AF009662 F aatgctggtgtcactcagaccccaaaattccgcatcctgaagataggacagagcatgacactgcagtgtgcccaggatatgaaccataactacatgtactggtatcgacaagacccaggcatggggctgaagctgatttattattcagttggtgctggtatcactgacaaaggagaagtcccgaatggctacaacgtctccagatcaaccacagaggatttcccgctcaggctggagttggctgctccctcccagacatctgtgtacttctgtgccagcagttactc
79
+ Homosap TRBV6-6 TRBV6-6*03 X58815 (F) aatgctggtgtcactcagaccccaaaattccgcatcctgaagataggacagagcatgacactgcagtgtgcccaggatatgaaccataactacatgtactggtatcgacaagacccaggcatggggctgaagctgatttattattcagttggtgctggtatcactgataaaggagaagtcccgaatggctacaacgtctccagatcaaccacagaggatttcccgctcaggctggagttggctgctccctcccagacatctgtgtacttctgtgccagcagt
80
+ Homosap TRBV6-6 TRBV6-6*04 X74848 (F) aatgctggtgtcactcagaccccaaaattccgcatcctgaagataggacagagcatgacactgcagtgtacccaggatatgaaccatgaatacatgtactggtatcgacaagacccaggcatggggctgaagctgatttattattcagttggtgctggtatcactgataaaggagaagtcccgaatggctacaatgtctccagatcaaccacagaggatttcccgctcaggctggagttggctgctccctcccagacatctgtgtacttctgtgccagcagtcga
81
+ Homosap TRBV6-6 TRBV6-6*05 L06892 (F) aatgctggtgtcactcagaccccaaaattccgcatcctgaagataggacagagcatgacactgcagtgtgcccaggatatgaaccataactacatgtactggtatcgacaagacccaggcatggggctgaagctgatttattattcagttggtgctggtatcactgacaaaggagaagtcccgaatggctacaacgtctccagatcaaccacagaggatttcccgctcaggctggagttggctgctgcctcccagacatctgtgtacttctgtgccagcagc
82
+ Homosap TRBV6-8 TRBV6-8*01 L36092 F aatgctggtgtcactcagaccccaaaattccacatcctgaagacaggacagagcatgacactgcagtgtgcccaggatatgaaccatggatacatgtcctggtatcgacaagacccaggcatggggctgagactgatttactactcagctgctgctggtactactgacaaagaagtccccaatggctacaatgtctctagattaaacacagaggatttcccactcaggctggtgtcggctgctccctcccagacatctgtgtacttgtgtgccagcagttactc
83
+ Homosap TRBV6-9 TRBV6-9*01 X61447 F aatgctggtgtcactcagaccccaaaattccacatcctgaagacaggacagagcatgacactgcagtgtgcccaggatatgaaccatggatacttgtcctggtatcgacaagacccaggcatggggctgaggcgcattcattactcagttgctgctggtatcactgacaaaggagaagtccccgatggctacaatgtatccagatcaaacacagaggatttcccgctcaggctggagtcagctgctccctcccagacatctgtatacttctgtgccagcagttattc
84
+ Homosap TRBV7-2 TRBV7-2*01 X61442 F ggagctggagtctcccagtcccccagtaacaaggtcacagagaagggaaaggatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacagagcctggggcagggcctggagtttttaatttacttccaaggcaacagtgcaccagacaaatcagggctgcccagtgatcgcttctctgcagagaggactgggggatccgtctccactctgacgatccagcgcacacagcaggaggactcggccgtgtatctctgtgccagcagcttagc
85
+ Homosap TRBV7-2 TRBV7-2*02 L36190 F ggagctggagtctcccagtcccccagtaacaaggtcacagagaagggaaaggatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacagaggctggggcagggcctggagtttttaatttacttccaaggcaacagtgcaccagacaaatcagggctgcccagtgatcgcttctctgcagagaggactggggaatccgtctccactctgacgatccagcgcacacagcaggaggactcggccgtgtatctctgtgccagcagcttagc
86
+ Homosap TRBV7-2 TRBV7-2*03 U07975 F ggagctggagtctcccagtcccccagtaacaaggtcacagagaagggaaaggatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacagaggctggggcagggcctggagtttttaatttacttccaaggcaacagtgcaccagacaaatcagggctgcccagtgatcgcttctctgcagagaggactggggaatccgtctccactctgacgatccagcgcacacagcaggaggactcggccgtgtatctctgtaccagcagcttagc
87
+ Homosap TRBV7-2 TRBV7-2*04 M27387 (F) ggagctggagtttcccagtcccccagtaacaaggtcacagagaagggaaaggatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacagagcctggggcagggcctggagtttttaatttacttccaaggcaacagtgcaccagacaaatcagggctgcccagtgatcgcttctctgcagagaggactgggggatccgtctccactctgacgatccagcgcacacagcaggaggactcggccgtgtatctctgtgccagcagcttag
88
+ Homosap TRBV7-3 TRBV7-3*01 X61440 F ggtgctggagtctcccagacccccagtaacaaggtcacagagaagggaaaatatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacaaagcctggggcagggcccagagtttctaatttacttccaaggcacgggtgcggcagatgactcagggctgcccaacgatcggttctttgcagtcaggcctgagggatccgtctctactctgaagatccagcgcacagagcggggggactcagccgtgtatctctgtgccagcagcttaac
89
+ Homosap TRBV7-3 TRBV7-3*04 X74843 (F) ggtgctggagtctcccagacccccagtaacaaggtcacagagaagggaaaatatgtagagctcaggtgtgatccaatttcaggtcatactgccctttactggtaccgacaaagcctggggcagggcccagagtttctaatttacttccaaggcacgggtgcggcagatgactcagggctgcccaacgatcggttctttgcagtcaggcctgagggatccgtctctactctgaagatccagcgcacagagcggggggactctgccgtgtatctctgtgccagcagctt
90
+ Homosap TRBV7-4 TRBV7-4*01 L36092 F ggtgctggagtctcccagtccccaaggtacaaagtcgcaaagaggggacgggatgtagctctcaggtgtgattcaatttcgggtcatgtaaccctttattggtaccgacagaccctggggcagggctcagaggttctgacttactcccagagtgatgctcaacgagacaaatcagggcggcccagtggtcggttctctgcagagaggcctgagagatccgtctccactctgaagatccagcgcacagagcagggggactcagctgtgtatctctgtgccagcagcttagc
91
+ Homosap TRBV7-6 TRBV7-6*01 L36092 F ggtgctggagtctcccagtctcccaggtacaaagtcacaaagaggggacaggatgtagctctcaggtgtgatccaatttcgggtcatgtatccctttattggtaccgacaggccctggggcagggcccagagtttctgacttacttcaattatgaagcccaacaagacaaatcagggctgcccaatgatcggttctctgcagagaggcctgagggatccatctccactctgacgatccagcgcacagagcagcgggactcggccatgtatcgctgtgccagcagcttagc
92
+ Homosap TRBV7-6 TRBV7-6*02 X58806 (F) ggtgctggagtctcccagtctcccaggtacaaagtcacaaagaggggacaggatgtagctctcaggtgtgatccaatctcgggtcatgtatccctttattggtaccgacaggccctggggcagggcccagagtttctgacttacttcaattatgaagcccaacaagacaaatcagggctgcccaatgatcggttctctgcagagaggcctgagggatccatctccactctgacgatccagcgcacagagcagcgggactcggccatgtatcgctgtgccagcagc
93
+ Homosap TRBV7-7 TRBV7-7*01 L36092 F ggtgctggagtctcccagtctcccaggtacaaagtcacaaagaggggacaggatgtaactctcaggtgtgatccaatttcgagtcatgcaaccctttattggtatcaacaggccctggggcagggcccagagtttctgacttacttcaattatgaagctcaaccagacaaatcagggctgcccagtgatcggttctctgcagagaggcctgagggatccatctccactctgacgattcagcgcacagagcagcgggactcagccatgtatcgctgtgccagcagcttagc
94
+ Homosap TRBV7-7 TRBV7-7*02 X57607 (F) ggtgctggagtctcccagtctcccaggtacaaagtcacaaagaggggacaggatgtaactctcaggtgtgatccaatttcgagtcatgtaaccctttattggtatcaacaggccctggggcagggcccagagtttctgacttacttcaattatgaagctcaaccagacaaatcagggctgcccagtgatcggttctctgcagagaggcctgagggatccatctccactctgacgattcagcgcacagagcagcgggactcagccatgtatcgctgtgccagcagc
95
+ Homosap TRBV7-8 TRBV7-8*01 M11953 F ggtgctggagtctcccagtcccctaggtacaaagtcgcaaagagaggacaggatgtagctctcaggtgtgatccaatttcgggtcatgtatcccttttttggtaccaacaggccctggggcaggggccagagtttctgacttatttccagaatgaagctcaactagacaaatcggggctgcccagtgatcgcttctttgcagaaaggcctgagggatccgtctccactctgaagatccagcgcacacagcaggaggactccgccgtgtatctctgtgccagcagcttagc
96
+ Homosap TRBV7-8 TRBV7-8*02 X61441 F ggtgctggagtctcccagtcccctaggtacaaagtcgcaaagagaggacaggatgtagctctcaggtgtgatccaatttcgggtcatgtatcccttttttggtaccaacaggccctggggcaggggccagagtttctgacttatttccagaatgaagctcaactagacaaatcggggctgcccagtgatcgcttctttgcagaaaggcctgagggatccgtctccactctgaagatccagcgcacacagaaggaggactccgccgtgtatctctgtgccagcagcttagc
97
+ Homosap TRBV7-8 TRBV7-8*03 M27384 (F) ggtgctggagtctcccagtcccctaggtacaaagtcgcaaagagaggacaggatgtagctctcaggtgtgatccaatttcgggtcatgtatcccttttttggtaccaacaggccctcgggcaggggccagagtttctgacttatttccagaatgaagctcaactagacaaatcggggctgcccagtgatcgcttctttgcagaaaggcctgagggatccgtctccactctgaagatccagcgcacacagcaggaggactccgccgtgtatctctgtgccagcagccga
98
+ Homosap TRBV7-9 TRBV7-9*01 L36092 F gatactggagtctcccagaaccccagacacaagatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaccctggggcagggcccagagtttctgacttacttccagaatgaagctcaactagaaaaatcaaggctgctcagtgatcggttctctgcagagaggcctaagggatctttctccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcagcttagc
99
+ Homosap TRBV7-9 TRBV7-9*02 M15564 (F) gatactggagtctcccagaaccccagacacaacatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaccctggggcagggcccagagtttctgacttacttccagaatgaagctcaactagaaaaatcaaggctgctcagtgatcggttctctgcagagaggcctaagggatctttctccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcagcttagc
100
+ Homosap TRBV7-9 TRBV7-9*03 AF009663 F gatactggagtctcccaggaccccagacacaagatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaccctggggcagggcccagagtttctgacttacttccagaatgaagctcaactagaaaaatcaaggctgctcagtgatcggttctctgcagagaggcctaagggatctttctccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcagcttagc
101
+ Homosap TRBV7-9 TRBV7-9*04 M14261 (F) atatctggagtctcccacaaccccagacacaagatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaaccctgggcagggcccagagtttctgacttacttccagaatgaagctcaactggaaaaatcagggctgctcagtgatcggatctctgcagagaggcctaagggatctttctccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcagc
102
+ Homosap TRBV7-9 TRBV7-9*05 M27385 (F) gatactggagtctcccagaaccccagacacaagatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaccctggggcagggcccagagtttctgacttacttccagaatgaagctcaactagaaaaatcaaggctgctcagtgatcggttctctgcagagaggcctaagggatctctctccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcaccaaa
103
+ Homosap TRBV7-9 TRBV7-9*06 X74844 (F) gatactggagtctcccagaaccccagacacaagatcacaaagaggggacagaatgtaactttcaggtgtgatccaatttctgaacacaaccgcctttattggtaccgacagaccctggggcagggcccagagtttctgacttacttccagaatgaagctcaactagaaaaatcaaggctgctcagtgatcggttctctgcagagaggcctaagggatctctttccaccttggagatccagcgcacagagcagggggactcggccatgtatctctgtgccagcacgttg
104
+ Homosap TRBV9 TRBV9*01 L36092 F gattctggagtcacacaaaccccaaagcacctgatcacagcaactggacagcgagtgacgctgagatgctcccctaggtctggagacctctctgtgtactggtaccaacagagcctggaccagggcctccagttcctcattcagtattataatggagaagagagagcaaaaggaaacattcttgaacgattctccgcacaacagttccctgacttgcactctgaactaaacctgagctctctggagctgggggactcagctttgtatttctgtgccagcagcgtag
105
+ Homosap TRBV9 TRBV9*02 AF009660 F gattctggagtcacacaaaccccaaagcacctgatcacagcaactggacagcgagtgacgctgagatgctcccctaggtctggagacctctctgtgtactggtaccaacagagcctggaccagggcctccagttcctcattcactattataatggagaagagagagcaaaaggaaacattcttgaacgattctccgcacaacagttccctgacttgcactctgaactaaacctgagctctctggagctgggggactcagctttgtatttctgtgccagcagcgtag
106
+ Homosap TRBV9 TRBV9*03 M27380 (F) gattctggagtcacacaaaccccaaagcacctgatcacagcaactggacagcgagtgacgctgagatgctcccctaggtctggagacctctctgtgtactggtaccaacagagcctggaccagggcctccagttcctcattcaatattataatggagaagagagagcaaaaggaaacattcttgaacgattctccgcacaacagttccctgacttgcactctgaactaaacctgagctctctggagctgggggactcagctttgtatttctgtgccagcagc
src/main.py ADDED
@@ -0,0 +1,1423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+ import pandas as pd
6
+ from typing import Dict, List, Tuple, Optional
7
+ from tqdm import tqdm
8
+ from collections import Counter
9
+ from sklearn.metrics import confusion_matrix, roc_auc_score, average_precision_score
10
+ import warnings
11
+ from model import negative_sampling_phla
12
+ warnings.filterwarnings("ignore")
13
+
14
+ from physicochemical import PhysicochemicalEncoder
15
+
16
+ from model import (
17
+ ESM2Encoder,
18
+ ESMFoldEncoder,
19
+ PeptideHLABindingPredictor,
20
+ PepHLA_Dataset,
21
+ peptide_hla_collate_fn,
22
+ TCRPeptideHLABindingPredictor,
23
+ TCRPepHLA_Dataset,
24
+ tcr_pep_hla_collate_fn,
25
+ EarlyStopping
26
+ )
27
+
28
+ # ============================================================================
29
+ # Utility functions
30
+ # ============================================================================
31
+
32
def load_train_data(
    df_train_list: List[pd.DataFrame],
    df_val_list: List[pd.DataFrame],
    hla_dict_path: str = 'pMHC/HLA_dict.npy',
) -> Tuple[List[pd.DataFrame], List[pd.DataFrame]]:
    """
    Preprocess training and validation datasets.

    Strips the 'HLA-' prefix from allele names in each DataFrame's 'HLA'
    column and adds an 'HLA_full' column with the full HLA sequence looked
    up from the HLA dictionary.

    Args:
        df_train_list: List of training DataFrames, each with an 'HLA' column
        df_val_list: List of validation DataFrames, each with an 'HLA' column
        hla_dict_path: Path to the HLA name -> full sequence dictionary (.npy)

    Returns:
        (df_train_list, df_val_list) with 'HLA' normalized and 'HLA_full'
        added. Note the DataFrames are modified in place; the same list
        objects that were passed in are returned.

    Raises:
        KeyError: If a (normalized) HLA name is missing from the dictionary.
    """
    print("Loading training and validation data...")

    # Load HLA dictionary (allele name -> full amino-acid sequence)
    HLA_dict = np.load(hla_dict_path, allow_pickle=True).item()

    # Normalize HLA names and map them to their full sequences
    for df in df_train_list + df_val_list:
        df['HLA'] = df['HLA'].apply(lambda x: x[4:] if x.startswith('HLA-') else x)
        df['HLA_full'] = df['HLA'].apply(lambda x: HLA_dict[x])

    return df_train_list, df_val_list
61
+
62
def load_test_data(
    df_test: pd.DataFrame,
    hla_dict_path: str = 'pMHC/HLA_dict.npy'
) -> pd.DataFrame:
    """
    Preprocess a test DataFrame (e.g. an independent or external set).

    The input is not modified: a copy is returned with the 'HLA-' prefix
    stripped from allele names and an 'HLA_full' column added containing
    the full HLA sequence from the dictionary.

    Args:
        df_test: Test dataframe with at least 'HLA', 'peptide', 'label'
        hla_dict_path: Path to HLA dictionary (maps HLA name to full sequence)

    Returns:
        Processed copy of df_test with 'HLA_full' added

    Raises:
        KeyError: If a (normalized) HLA name is missing from the dictionary.
    """
    print("Processing test data...")

    # name -> full amino-acid sequence
    hla_lookup = np.load(hla_dict_path, allow_pickle=True).item()

    processed = df_test.copy()

    def strip_prefix(name):
        # 'HLA-A*02:01' -> 'A*02:01'; already-normalized names pass through
        return name[4:] if name.startswith('HLA-') else name

    processed['HLA'] = processed['HLA'].apply(strip_prefix)
    processed['HLA_full'] = processed['HLA'].apply(lambda allele: hla_lookup[allele])

    print(f"✓ Test set: {len(processed)} samples")
    return processed
86
+
87
+ class StriMap_pHLA:
88
+ """
89
+ StriMap for Structure-informed Peptide-HLA Binding Prediction Model
90
+ """
91
+
92
    def __init__(
        self,
        device: str = 'cuda:0',
        model_save_path: str = 'model_params/best_model_phla.pt',
        pep_dim: int = 256,
        hla_dim: int = 256,
        bilinear_dim: int = 256,
        loss_fn: str = 'bce',
        alpha: float = 0.5,
        gamma: float = 2.0,
        esm2_layer: int = 33,
        batch_size: int = 256,
        esmfold_cache_dir: str = "esm_cache",
        cache_dir: str = 'phla_cache',
        cache_save: bool = True,
        seed: int = 1,
        pos_weights: Optional[float] = None
    ):
        """
        Initialize StriMap model.

        Args:
            device: Device for computation (silently falls back to CPU when
                CUDA is unavailable)
            model_save_path: Path where the best checkpoint is saved; its
                parent directory is created if missing
            pep_dim: Peptide embedding dimension
            hla_dim: HLA embedding dimension
            bilinear_dim: Bilinear attention dimension
            loss_fn: Loss function ('bce' or 'focal')
            alpha: Alpha parameter for focal loss
            gamma: Gamma parameter for focal loss
            esm2_layer: ESM2 layer to extract features from
            batch_size: Batch size used when computing embeddings
            esmfold_cache_dir: Cache directory for ESMFold
            cache_dir: Directory for caching sequence embeddings
            cache_save: Whether to persist computed embeddings to disk
            seed: Random seed
            pos_weights: Optional positive-class weight passed to the loss
        """
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.model_save_path = model_save_path
        # Create the checkpoint directory if needed (no-op for bare filenames)
        if not os.path.exists(os.path.dirname(model_save_path)) and os.path.dirname(model_save_path) != '':
            os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
        self.seed = seed
        self.cache_save = cache_save
        self.batch_size = batch_size
        self.loss_fn_name = loss_fn
        self.alpha = alpha
        self.gamma = gamma
        self.pos_weights = pos_weights

        # Seed RNGs BEFORE building the model so weight init is reproducible
        self._set_seed(seed)

        # Initialize encoders (physicochemical, ESM2 sequence, ESMFold structure)
        print("Initializing encoders...")
        self.phys_encoder = PhysicochemicalEncoder(device=self.device)
        self.esm2_encoder = ESM2Encoder(device=str(self.device), layer=esm2_layer, cache_dir=cache_dir)
        self.esmfold_encoder = ESMFoldEncoder(esm_cache_dir=esmfold_cache_dir, cache_dir=cache_dir)

        # Initialize model
        print("Initializing binding prediction model...")
        self.model = PeptideHLABindingPredictor(
            pep_dim=pep_dim,
            hla_dim=hla_dim,
            bilinear_dim=bilinear_dim,
            loss_fn=self.loss_fn_name,
            alpha=self.alpha,
            gamma=self.gamma,
            device=str(self.device),
            pos_weights=self.pos_weights
        ).to(self.device)

        # Embedding caches; populated by prepare_embeddings() and required
        # by train()/predict()
        self.phys_dict = None
        self.esm2_dict = None
        self.struct_dict = None

        print(f"✓ StriMap initialized on {self.device}")
168
+
169
+ def _set_seed(self, seed: int):
170
+ """Set random seeds for reproducibility"""
171
+ np.random.seed(seed)
172
+ torch.manual_seed(seed)
173
+ torch.cuda.manual_seed(seed)
174
+ torch.cuda.manual_seed_all(seed)
175
+ torch.backends.cudnn.benchmark = False
176
+ torch.backends.cudnn.deterministic = True
177
+
178
+ def prepare_embeddings(
179
+ self,
180
+ df: pd.DataFrame,
181
+ force_recompute: bool = False,
182
+ ):
183
+ """
184
+ Prepare all embeddings (physicochemical, ESM2, structure)
185
+
186
+ Args:
187
+ df: DataFrame containing 'peptide' and 'HLA_full' columns
188
+ force_recompute: Force recomputation even if cache exists
189
+ incremental: If True, only compute missing sequences
190
+ phys_cache: Physicochemical embeddings cache file
191
+ esm2_cache: ESM2 embeddings cache file
192
+ struct_cache: Structure embeddings cache file
193
+ """
194
+
195
+ # Extract unique sequences
196
+ all_peptides = sorted(set(df['peptide'].astype(str)))
197
+ all_hlas = sorted(set(df['HLA_full'].astype(str)))
198
+
199
+ print(f"\n{'='*70}")
200
+ print(f"Preparing embeddings for:")
201
+ print(f" - {len(all_peptides)} unique peptides")
202
+ print(f" - {len(all_hlas)} unique HLAs")
203
+ print(f"{'='*70}\n")
204
+
205
+ # ========================================================================
206
+ # 1. Physicochemical features
207
+ # ========================================================================
208
+ self.phys_dict = {
209
+ 'pep': self._encode_phys(all_peptides),
210
+ 'hla': self._encode_phys(all_hlas)
211
+ }
212
+
213
+ # ========================================================================
214
+ # 2. ESM2 embeddings
215
+ # ========================================================================
216
+ self.esm2_dict = {
217
+ 'pep': self._encode_esm2(all_peptides, prefix='pep', re_embed=force_recompute),
218
+ 'hla': self._encode_esm2(all_hlas, prefix='hla', re_embed=force_recompute)
219
+ }
220
+
221
+ # ========================================================================
222
+ # 3. Structure features (only for HLA)
223
+ # ========================================================================
224
+ self.struct_dict = self._encode_structure(all_hlas)
225
+
226
+ # ========================================================================
227
+ # Summary
228
+ # ========================================================================
229
+ print(f"{'='*70}")
230
+ print("✓ All embeddings prepared!")
231
+ print(f" - Phys: {len(self.phys_dict['pep'])} peptides, {len(self.phys_dict['hla'])} HLAs")
232
+ print(f" - ESM2: {len(self.esm2_dict['pep'])} peptides, {len(self.esm2_dict['hla'])} HLAs")
233
+ print(f" - Struct: {len(self.struct_dict)} HLAs")
234
+ print(f"{'='*70}\n")
235
+
236
+ def _encode_phys(self,
237
+ sequences: List[str]) -> Dict[str, torch.Tensor]:
238
+ """Encode physicochemical properties"""
239
+ emb_dict = {}
240
+
241
+ for i in tqdm(range(0, len(sequences), self.batch_size), desc="Phys encoding"):
242
+ batch = sequences[i:i+self.batch_size]
243
+ embs = self.phys_encoder(batch).cpu() # [B, L, D]
244
+ for seq, emb in zip(batch, embs):
245
+ emb_dict[seq] = emb
246
+
247
+ return emb_dict
248
+
249
+ def _encode_esm2(self, sequences: List[str], prefix: str, re_embed: bool = False) -> Dict[str, torch.Tensor]:
250
+ """Encode with ESM2"""
251
+ df_tmp = pd.DataFrame({'seq': sequences})
252
+ emb_dict = self.esm2_encoder.forward(
253
+ df_tmp,
254
+ seq_col='seq',
255
+ prefix=prefix,
256
+ batch_size=self.batch_size,
257
+ re_embed=re_embed,
258
+ cache_save=self.cache_save
259
+ )
260
+ return emb_dict
261
+
262
+ def _encode_structure(self, sequences: List[str], re_embed: bool = False) -> Dict[str, Tuple]:
263
+ """Encode structure with ESMFold"""
264
+ feat_list, coor_list = self.esmfold_encoder.forward(
265
+ pd.DataFrame({'hla': sequences}),
266
+ 'hla',
267
+ device=str(self.device),
268
+ re_embed=re_embed,
269
+ )
270
+
271
+ struct_dict = {
272
+ seq: (feat, coor)
273
+ for seq, feat, coor in zip(sequences, feat_list, coor_list)
274
+ }
275
+ return struct_dict
276
+
277
    def train(
        self,
        df_train: pd.DataFrame,
        df_val: pd.DataFrame,
        epochs: int = 100,
        batch_size: int = 256,
        lr: float = 1e-4,
        patience: int = 5,
        num_workers: int = 8,
        fold_id: Optional[int] = None
    ) -> Dict[str, List[float]]:
        """
        Train the model.

        Args:
            df_train: Training data
            df_val: Validation data
            epochs: Number of epochs
            batch_size: Batch size
            lr: Learning rate
            patience: Early stopping patience (epochs without improvement)
            num_workers: Number of data loading workers
            fold_id: Fold identifier for saving (None for single model)

        Returns:
            Dictionary with training history ('train_loss', 'val_loss',
            'val_auc', 'val_prc'), one entry per completed epoch.

        Raises:
            ValueError: If prepare_embeddings() has not been called.
        """
        # Check if embeddings are prepared
        if self.phys_dict is None or self.esm2_dict is None or self.struct_dict is None:
            raise ValueError("Embeddings not prepared! Call prepare_embeddings() first.")

        # Create datasets
        print("Creating datasets...")
        train_dataset = PepHLA_Dataset(df_train, self.phys_dict, self.esm2_dict, self.struct_dict)
        val_dataset = PepHLA_Dataset(df_val, self.phys_dict, self.esm2_dict, self.struct_dict)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
            collate_fn=peptide_hla_collate_fn,
            pin_memory=True
        )

        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            collate_fn=peptide_hla_collate_fn,
            pin_memory=True
        )

        # Optimizer and early stopping
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)

        # Per-fold checkpoint path in K-fold mode ('..._fold{K}.pt')
        save_path = self.model_save_path if fold_id is None else \
                    self.model_save_path.replace('.pt', f'_fold{fold_id}.pt')

        # NOTE: early stopping monitors validation PRC (average precision),
        # not AUC — the saved checkpoint is the best-PRC model.
        early_stopping = EarlyStopping(
            patience=patience,
            save_path=save_path
        )

        # Training history
        history = {
            'train_loss': [],
            'val_loss': [],
            'val_auc': [],
            'val_prc': []
        }

        fold_str = f"Fold {fold_id}" if fold_id is not None else "Single model"
        print(f"\nStarting training for {epochs} epochs [{fold_str}]...")
        print("=" * 70)

        for epoch in range(epochs):
            # ---- Training phase ----
            self.model.train()
            train_loss = 0.0
            train_batches = 0

            train_iter = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]", leave=False, ncols=80)

            for batch in train_iter:
                optimizer.zero_grad()
                # Model forward returns (probabilities, loss, attention, extra)
                probs, loss, _, _ = self.model(batch)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
                train_batches += 1

            train_loss /= train_batches

            # ---- Validation phase ----
            self.model.eval()
            val_loss = 0.0
            val_preds = []
            val_labels = []
            val_batches = 0

            with torch.no_grad():
                val_iter = tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]", leave=False, ncols=80)
                for batch in val_iter:
                    probs, loss, _, _ = self.model(batch)
                    val_loss += loss.item()
                    val_batches += 1
                    val_preds.extend(probs)
                    val_labels.extend(batch['label'])

            val_auc = roc_auc_score(val_labels, val_preds)
            val_loss /= val_batches
            val_prc = average_precision_score(val_labels, val_preds)

            # Record history
            history['train_loss'].append(train_loss)
            history['val_loss'].append(val_loss)
            history['val_auc'].append(val_auc)
            history['val_prc'].append(val_prc)

            # Print metrics
            print(f"[{fold_str}] Epoch [{epoch+1}/{epochs}] | "
                  f"Train Loss: {train_loss:.4f} | "
                  f"Val Loss: {val_loss:.4f} | "
                  f"Val AUC: {val_auc:.4f} | "
                  f"Val PRC: {val_prc:.4f}")
            # Early stopping (saves a checkpoint whenever val PRC improves)
            early_stopping(val_prc, self.model)

            if early_stopping.early_stop:
                print(f"\n[{fold_str}] Early stopping triggered at epoch {epoch+1}!")
                break

        # Restore the best checkpoint written by early stopping
        print(f"\n[{fold_str}] Loading best model from {save_path}...")
        self.model.load_state_dict(torch.load(save_path))

        print("=" * 70)
        print(f"✓ Training completed for {fold_str}!")

        return history
420
+
421
    def train_kfold(
        self,
        train_folds: List[Tuple[pd.DataFrame, pd.DataFrame]],
        epochs: int = 100,
        batch_size: int = 256,
        lr: float = 1e-4,
        patience: int = 5,
        num_workers: int = 8
    ) -> List[Dict[str, List[float]]]:
        """
        Train K-fold cross-validation models.

        Each fold re-seeds the RNGs, reinitializes the model from scratch,
        and trains via self.train(), which writes a per-fold checkpoint
        ('..._fold{K}.pt').

        Args:
            train_folds: List of (train_df, val_df) tuples for each fold
            epochs: Number of epochs per fold
            batch_size: Batch size
            lr: Learning rate
            patience: Early stopping patience
            num_workers: Number of data loading workers

        Returns:
            List of training histories for each fold
        """
        num_folds = len(train_folds)
        all_histories = []

        print("\n" + "=" * 70)
        print(f"Starting {num_folds}-Fold Cross-Validation Training")
        print("=" * 70)

        for fold_id, (df_train, df_val) in enumerate(train_folds):
            print(f"\n{'='*70}")
            print(f"Training Fold {fold_id+1}/{num_folds}")
            print(f"Train: {len(df_train)} samples | Val: {len(df_val)} samples")
            print(f"{'='*70}")

            self._set_seed(fold_id + self.seed)  # Different seed for each fold

            # Reinitialize model for this fold so folds do not share weights.
            # NOTE(review): assumes PeptideHLABindingPredictor exposes
            # pep_dim / hla_dim / bilinear_dim attributes — confirm in model.py.
            self.model = PeptideHLABindingPredictor(
                pep_dim=self.model.pep_dim,
                hla_dim=self.model.hla_dim,
                bilinear_dim=self.model.bilinear_dim,
                loss_fn=self.loss_fn_name,
                alpha=self.alpha,
                gamma=self.gamma,
                device=str(self.device),
                pos_weights=self.pos_weights
            ).to(self.device)

            # Train this fold (checkpoints to ..._fold{fold_id}.pt)
            history = self.train(
                df_train,
                df_val,
                epochs=epochs,
                batch_size=batch_size,
                lr=lr,
                patience=patience,
                num_workers=num_workers,
                fold_id=fold_id
            )

            all_histories.append(history)

        print("\n" + "=" * 70)
        print(f"✓ All {num_folds} folds training completed!")
        print("=" * 70)

        # Print summary. NOTE: the summary reports best val AUC per fold,
        # while the saved checkpoints track best val PRC (see train()).
        print("\nCross-Validation Summary:")
        print("-" * 70)
        for fold_id, history in enumerate(all_histories):
            best_auc = max(history['val_auc'])
            best_epoch = history['val_auc'].index(best_auc) + 1
            print(f"Fold {fold_id}: Best Val AUC = {best_auc:.4f} (Epoch {best_epoch})")

        mean_auc = np.mean([max(h['val_auc']) for h in all_histories])
        std_auc = np.std([max(h['val_auc']) for h in all_histories])
        print("-" * 70)
        print(f"Mean Val AUC: {mean_auc:.4f} ± {std_auc:.4f}")
        print("=" * 70 + "\n")

        return all_histories
504
+
505
+ def predict(
506
+ self,
507
+ df: pd.DataFrame,
508
+ batch_size: int = 256,
509
+ return_probs: bool = True,
510
+ return_attn: bool = False,
511
+ use_kfold: bool = False,
512
+ num_folds: Optional[int] = None,
513
+ ensemble_method: str = 'mean',
514
+ num_workers: int = 8
515
+ ) -> np.ndarray:
516
+ """
517
+ Make predictions on a dataset
518
+
519
+ Args:
520
+ df: DataFrame with peptide and HLA_full columns
521
+ batch_size: Batch size for inference
522
+ return_probs: If True, return probabilities; else return binary predictions
523
+ use_kfold: If True, use ensemble of K models
524
+ num_folds: Number of folds (required if use_kfold=True)
525
+ ensemble_method: 'mean' or 'median' for ensemble
526
+
527
+ Returns:
528
+ Array of predictions
529
+ """
530
+ # Check if embeddings are prepared
531
+ if self.phys_dict is None or self.esm2_dict is None or self.struct_dict is None:
532
+ raise ValueError("Embeddings not prepared! Call prepare_embeddings() first.")
533
+
534
+ if use_kfold:
535
+ if num_folds is None:
536
+ raise ValueError("num_folds must be specified when use_kfold=True")
537
+
538
+ return self._predict_ensemble(
539
+ df,
540
+ batch_size,
541
+ num_folds,
542
+ ensemble_method,
543
+ return_probs,
544
+ return_attn,
545
+ num_workers
546
+ )
547
+ else:
548
+ # load single model
549
+ print(f"\nLoading model from {self.model_save_path} for prediction...")
550
+ self.model.load_state_dict(torch.load(self.model_save_path, map_location=self.device), strict=False)
551
+ # Single model prediction
552
+ return self._predict_single(df, batch_size, return_probs, return_attn, num_workers)
553
+
554
+ def _pad_attention(self, attns: List[np.ndarray]) -> np.ndarray:
555
+ """Pad attention maps to the same length"""
556
+ max_len = max(a.shape[1] for a in attns)
557
+ attns_padded = []
558
+ for a in attns:
559
+ padding = max_len - a.shape[1]
560
+ pad_width_3d = ((0, 0), # 不填充 H 维度
561
+ (0, padding), # 填充 Lv 维度的末尾
562
+ (0, 0)) # 不填充 Lq 维度
563
+
564
+ attns_padded.append(np.pad(a, pad_width_3d, mode='constant', constant_values=0.0))
565
+ return np.concatenate(attns_padded, axis=0)
566
+
567
+ def _predict_single(
568
+ self,
569
+ df: pd.DataFrame,
570
+ batch_size: int,
571
+ return_probs: bool,
572
+ return_attn: bool = False,
573
+ num_workers: int = 8
574
+ ) -> np.ndarray:
575
+ """Single model prediction"""
576
+ self.model.eval()
577
+
578
+ dataset = PepHLA_Dataset(df, self.phys_dict, self.esm2_dict, self.struct_dict)
579
+ loader = torch.utils.data.DataLoader(
580
+ dataset,
581
+ batch_size=batch_size,
582
+ shuffle=False,
583
+ num_workers=num_workers,
584
+ collate_fn=peptide_hla_collate_fn,
585
+ pin_memory=True
586
+ )
587
+
588
+ preds = []
589
+ attns = []
590
+ with torch.no_grad():
591
+ for batch in tqdm(loader, desc="Predicting"):
592
+ probs, loss, attn, _ = self.model(batch)
593
+ preds.extend(probs.tolist())
594
+ if return_attn:
595
+ attns.append(attn)
596
+
597
+ preds = np.array(preds)
598
+ if not return_probs:
599
+ preds = (preds >= 0.5).astype(int)
600
+
601
+ # padding attns to the same length
602
+ if not return_attn:
603
+ return preds, None
604
+ else:
605
+ return preds, self._pad_attention(attns)
606
+
607
+ def _predict_ensemble(
608
+ self,
609
+ df: pd.DataFrame,
610
+ batch_size: int,
611
+ num_folds: int,
612
+ ensemble_method: str,
613
+ return_probs: bool,
614
+ return_attn: bool = False,
615
+ num_workers: int = 8
616
+ ) -> np.ndarray:
617
+ """Ensemble prediction using K-fold models"""
618
+
619
+ print(f"\nEnsemble prediction using {num_folds} models...")
620
+ print(f"Ensemble method: {ensemble_method}")
621
+
622
+ all_preds = []
623
+ all_attns = []
624
+
625
+ for fold_id in range(num_folds):
626
+ # Load fold model
627
+ fold_model_path = self.model_save_path.replace('.pt', f'_fold{fold_id}.pt')
628
+
629
+ if not os.path.exists(fold_model_path):
630
+ print(f"⚠ Warning: {fold_model_path} not found, skipping...")
631
+ continue
632
+
633
+ print(f"Loading model from {fold_model_path}...")
634
+ self.model.load_state_dict(torch.load(fold_model_path, map_location=self.device), strict=False)
635
+
636
+ # Predict with this fold
637
+ if not return_attn:
638
+ fold_preds, _ = self._predict_single(df, batch_size, return_probs=True, num_workers=num_workers)
639
+ else:
640
+ fold_preds, attn_padded = self._predict_single(df, batch_size, return_probs=True, return_attn=True, num_workers=num_workers)
641
+ all_attns.append(attn_padded)
642
+
643
+ all_preds.append(fold_preds)
644
+
645
+ if len(all_preds) == 0:
646
+ raise ValueError("No fold models found!")
647
+
648
+ # Ensemble predictions
649
+ all_preds = np.array(all_preds) # [num_folds, num_samples]
650
+
651
+ if ensemble_method == 'mean':
652
+ ensemble_preds = np.mean(all_preds, axis=0)
653
+ elif ensemble_method == 'median':
654
+ ensemble_preds = np.median(all_preds, axis=0)
655
+ else:
656
+ raise ValueError(f"Unknown ensemble method: {ensemble_method}")
657
+
658
+ print(f"✓ Ensemble prediction completed using {len(all_preds)} models")
659
+
660
+ if not return_probs:
661
+ ensemble_preds = (ensemble_preds >= 0.5).astype(int)
662
+
663
+ if not return_attn:
664
+ return ensemble_preds, None
665
+ else:
666
+
667
+ # num_attn_each_fold = attns_padded.shape[0] // len(all_preds)
668
+ # # average attns across folds
669
+ # attns_padded = attns_padded.reshape(len(all_preds), num_attn_each_fold, attns_padded.shape[1], attns_padded.shape[2])
670
+ # attns_padded = np.mean(attns_padded, axis=1)
671
+ return ensemble_preds, self._pad_attention(all_attns)
672
+
673
+ def evaluate(
674
+ self,
675
+ df: pd.DataFrame,
676
+ batch_size: int = 256,
677
+ threshold: float = 0.5,
678
+ use_kfold: bool = False,
679
+ num_folds: Optional[int] = None,
680
+ ensemble_method: str = 'mean',
681
+ num_workers: int = 8
682
+ ) -> Dict[str, float]:
683
+ """
684
+ Evaluate model on a dataset
685
+
686
+ Args:
687
+ df: DataFrame with peptide, HLA_full, and label columns
688
+ batch_size: Batch size for inference
689
+ threshold: Classification threshold
690
+ use_kfold: If True, use ensemble of K models
691
+ num_folds: Number of folds (required if use_kfold=True)
692
+ ensemble_method: 'mean' or 'median' for ensemble
693
+
694
+ Returns:
695
+ Dictionary of metrics
696
+ """
697
+ y_true = df['label'].values
698
+ y_prob, _ = self.predict(
699
+ df,
700
+ batch_size=batch_size,
701
+ return_probs=True,
702
+ use_kfold=use_kfold,
703
+ num_folds=num_folds,
704
+ ensemble_method=ensemble_method,
705
+ num_workers=num_workers
706
+ )
707
+ y_pred = (y_prob >= threshold).astype(int)
708
+
709
+ # Calculate metrics
710
+ tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel().tolist()
711
+
712
+ accuracy = (tp + tn) / (tn + fp + fn + tp)
713
+
714
+ try:
715
+ mcc = ((tp*tn) - (fn*fp)) / np.sqrt(float((tp+fn)*(tn+fp)*(tp+fp)*(tn+fn)))
716
+ except:
717
+ mcc = 0.0
718
+
719
+ try:
720
+ recall = tp / (tp + fn)
721
+ except:
722
+ recall = 0.0
723
+
724
+ try:
725
+ precision = tp / (tp + fp)
726
+ except:
727
+ precision = 0.0
728
+
729
+ try:
730
+ f1 = 2 * precision * recall / (precision + recall)
731
+ except:
732
+ f1 = 0.0
733
+
734
+ try:
735
+ roc_auc = roc_auc_score(y_true, y_prob)
736
+ except:
737
+ roc_auc = 0.0
738
+
739
+ try:
740
+ # prc
741
+ from sklearn.metrics import average_precision_score
742
+ prc_auc = average_precision_score(y_true, y_prob)
743
+ except:
744
+ prc_auc = 0.0
745
+
746
+ # Print results
747
+ model_type = f"{num_folds}-Fold Ensemble ({ensemble_method})" if use_kfold else "Single Model"
748
+
749
+ print("\n" + "=" * 70)
750
+ print(f"Evaluation Results [{model_type}]")
751
+ print("=" * 70)
752
+ print(f"tn = {tn}, fp = {fp}, fn = {fn}, tp = {tp}")
753
+ print(f"y_pred: 0 = {Counter(y_pred)[0]} | 1 = {Counter(y_pred)[1]}")
754
+ print(f"y_true: 0 = {Counter(y_true)[0]} | 1 = {Counter(y_true)[1]}")
755
+ print(f"AUC: {roc_auc:.4f} | PRC: {prc_auc:.4f} | ACC: {accuracy:.4f} | MCC: {mcc:.4f} | F1: {f1:.4f}")
756
+ print(f"Precision: {precision:.4f} | Recall: {recall:.4f}")
757
+ print("=" * 70 + "\n")
758
+
759
+ return y_prob, {
760
+ 'auc': roc_auc,
761
+ 'prc': prc_auc,
762
+ 'accuracy': accuracy,
763
+ 'mcc': mcc,
764
+ 'f1': f1,
765
+ 'precision': precision,
766
+ 'recall': recall,
767
+ 'tn': tn,
768
+ 'fp': fp,
769
+ 'fn': fn,
770
+ 'tp': tp
771
+ }
772
+
773
+ def save_model(self, path: str):
774
+ """Save model weights"""
775
+ torch.save(self.model.state_dict(), path)
776
+ print(f"✓ Model saved to {path}")
777
+
778
+ def load_model(self, path: str):
779
+ """Load model weights"""
780
+ self.model.load_state_dict(torch.load(path, map_location=self.device), strict=False)
781
+ print(f"✓ Model loaded from {path}")
782
+
783
+ # ============================================================================
784
+
785
+ # -*- coding: utf-8 -*-
786
+ import os
787
+ import numpy as np
788
+ import pandas as pd
789
+ from collections import Counter
790
+ from tqdm import tqdm
791
+ import torch
792
+ from sklearn.metrics import roc_auc_score, confusion_matrix
793
+
794
+ class StriMap_TCRpHLA:
795
+ """
796
+ Structure-informed TCR(α/β)–peptide–HLA Binding Prediction
797
+ - Reuses encoders from StriMap_pHLA (phys, ESM2, ESMFold)
798
+ - Precomputes peptide–HLA features using pretrained StriMap_pHLA.model (PeptideHLABindingPredictor)
799
+ and injects them into batch during training/inference.
800
+ """
801
+
802
    def __init__(
        self,
        pep_hla_system = None,  # already-initialized and pretrained
        device: str = 'cuda:0',
        model_save_path: str = 'best_model_tcrpHLA.pt',
        tcr_dim: int = 256,
        pep_dim: int = 256,
        hla_dim: int = 256,
        bilinear_dim: int = 256,
        loss_fn: str = 'bce',
        alpha: float = 0.5,
        gamma: float = 2.0,
        resample_negatives: bool = False,
        seed: int = 1,
        pos_weights: Optional[float] = None
    ):
        """
        Args:
            pep_hla_system: Trained StriMap_pHLA instance whose encoders and
                peptide–HLA model are reused here. Required.
            device: Torch device string; silently falls back to CPU when CUDA
                is unavailable.
            model_save_path: Checkpoint path for the best TCR–pHLA model.
            tcr_dim / pep_dim / hla_dim / bilinear_dim: Hidden sizes forwarded
                to TCRPeptideHLABindingPredictor.
            loss_fn: Loss identifier forwarded to the predictor (e.g. 'bce').
            alpha, gamma: Loss hyperparameters forwarded to the predictor.
            resample_negatives: If True, train() re-draws negatives each epoch.
            seed: Base random seed (also re-used per fold in train_kfold).
            pos_weights: Optional positive-class weight forwarded to the loss.

        Raises:
            ValueError: If pep_hla_system is None.
        """
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.model_save_path = model_save_path
        self.seed = seed
        self.alpha = alpha
        self.gamma = gamma
        self.loss_fn_name = loss_fn
        self.resample_negatives = resample_negatives
        self.pos_weights = pos_weights

        # Seed all RNGs before any model construction for reproducibility.
        self._set_seed(seed)

        if pep_hla_system is None:
            raise ValueError("`pep_hla_system` must be provided — pass a trained StriMap_pHLA instance.")

        # Reuse encoders from StriMap_pHLA
        self.phys_encoder = pep_hla_system.phys_encoder
        self.esm2_encoder = pep_hla_system.esm2_encoder
        self.esmfold_encoder= pep_hla_system.esmfold_encoder
        self.pep_hla_model = pep_hla_system.model  # PeptideHLABindingPredictor with encode_peptide_hla()

        # Initialize TCR–pHLA model
        self.model = TCRPeptideHLABindingPredictor(
            tcr_dim=tcr_dim,
            pep_dim=pep_dim,
            hla_dim=hla_dim,
            bilinear_dim=bilinear_dim,
            loss_fn=self.loss_fn_name,
            alpha=self.alpha,
            gamma=self.gamma,
            pos_weights=self.pos_weights,
            device=str(self.device),
        ).to(self.device)

        # Embedding caches — populated later by prepare_embeddings() and
        # prepare_pep_hla_features().
        self.phys_dict = None
        self.esm2_dict = None
        self.struct_dict = None
        self.pep_hla_feat_dict = {}

        print(f"✓ StriMap_TCRpHLA initialized on {self.device}")
859
+
860
+ # -------------------- utils --------------------
861
+ def _set_seed(self, seed: int):
862
+ np.random.seed(seed)
863
+ torch.manual_seed(seed)
864
+ torch.cuda.manual_seed(seed)
865
+ torch.cuda.manual_seed_all(seed)
866
+ torch.backends.cudnn.benchmark = False
867
+ torch.backends.cudnn.deterministic = True
868
+
869
+ # -------------------- encoders --------------------
870
+ def _encode_phys(self, sequences):
871
+ emb_dict = {}
872
+ batch_size = 256
873
+ for i in tqdm(range(0, len(sequences), batch_size), desc="Phys encoding (TCRpHLA)"):
874
+ batch = sequences[i:i+batch_size]
875
+ embs = self.phys_encoder(batch).cpu() # [B, L, D]
876
+ for seq, emb in zip(batch, embs):
877
+ emb_dict[seq] = emb
878
+ return emb_dict
879
+
880
+ def save_model(self, path: str):
881
+ torch.save(self.model.state_dict(), path)
882
+ print(f"✓ Model saved to {path}")
883
+
884
+ def load_model(self, path: str):
885
+ """Load model weights"""
886
+ self.model.load_state_dict(torch.load(path, map_location=self.device))
887
+ print(f"✓ Model loaded from {path}")
888
+
889
+ def _encode_esm2(self, sequences, prefix: str, re_embed: bool=False):
890
+ df_tmp = pd.DataFrame({'seq': sequences})
891
+ return self.esm2_encoder.forward(
892
+ df_tmp, seq_col='seq', prefix=prefix, batch_size=128, re_embed=re_embed
893
+ )
894
+
895
+ def _encode_structure(self, sequences, prefix: str, re_embed: bool=False):
896
+ feat_list, coor_list = self.esmfold_encoder.forward(
897
+ pd.DataFrame({prefix: sequences}), prefix, device=str(self.device), re_embed=re_embed
898
+ )
899
+ return {seq: (feat, coor) for seq, feat, coor in zip(sequences, feat_list, coor_list)}
900
+
901
+ # -------------------- public: prepare embeddings --------------------
902
    def prepare_embeddings(self, df: pd.DataFrame, force_recompute: bool=False):
        """
        Prepare per-residue encodings for TCRα, TCRβ, peptide, and HLA.
        Peptide structure is computed via ESMFold as requested.

        Populates self.phys_dict, self.esm2_dict and self.struct_dict (each
        keyed by chain type 'tcra'/'tcrb'/'pep'/'hla', then by sequence) and
        records self.max_pep_len. All tensors are moved to CPU afterwards so
        GPU memory is released before training.

        Args:
            df: DataFrame with 'tcra', 'tcrb', 'peptide' and 'HLA_full' columns.
            force_recompute: If True, re-run ESM2/ESMFold instead of reusing
                their on-disk caches.
        """
        # Unique sequences with deterministic ordering per chain family.
        all_tcra = sorted(set(df['tcra'].astype(str)))
        all_tcrb = sorted(set(df['tcrb'].astype(str)))
        all_peps = sorted(set(df['peptide'].astype(str)))
        all_hlas = sorted(set(df['HLA_full'].astype(str)))

        # Consumed later by prepare_pep_hla_features() for peptide padding.
        self.max_pep_len = max(len(p) for p in all_peps)

        print(f"\nPreparing embeddings:")
        print(f"  - TCRα: {len(all_tcra)} | TCRβ: {len(all_tcrb)} | peptides: {len(all_peps)} | HLAs: {len(all_hlas)}\n")

        # Physicochemical per-residue features.
        self.phys_dict = {
            'tcra': self._encode_phys(all_tcra),
            'tcrb': self._encode_phys(all_tcrb),
            'pep': self._encode_phys(all_peps),
            'hla': self._encode_phys(all_hlas)
        }
        # ESM2 language-model embeddings (cached on disk by the encoder).
        self.esm2_dict = {
            'tcra': self._encode_esm2(all_tcra, prefix='tcra', re_embed=force_recompute),
            'tcrb': self._encode_esm2(all_tcrb, prefix='tcrb', re_embed=force_recompute),
            'pep': self._encode_esm2(all_peps, prefix='pep', re_embed=force_recompute),
            'hla': self._encode_esm2(all_hlas, prefix='hla', re_embed=force_recompute)
        }

        # Move everything in phys_dict and esm2_dict to CPU
        for d in [self.phys_dict, self.esm2_dict]:
            for k1 in d.keys():  # tcra / tcrb / pep / hla
                for k2 in d[k1].keys():  # actual sequences
                    if torch.is_tensor(d[k1][k2]):
                        d[k1][k2] = d[k1][k2].cpu()

        torch.cuda.empty_cache()

        # IMPORTANT: include peptide structure via ESMFold
        self.struct_dict = {
            'tcra': self._encode_structure(all_tcra, prefix='tcra', re_embed=force_recompute),
            'tcrb': self._encode_structure(all_tcrb, prefix='tcrb', re_embed=force_recompute),
            'pep': self._encode_structure(all_peps, prefix='pep', re_embed=force_recompute),
            'hla': self._encode_structure(all_hlas, prefix='hla', re_embed=force_recompute)
        }

        print("✓ Embeddings prepared for TCRα/β, peptide (with ESMFold), and HLA.")

        # Move structure features to CPU
        for part in ['tcra', 'tcrb', 'pep', 'hla']:
            for seq, (feat, coord) in self.struct_dict[part].items():
                self.struct_dict[part][seq] = (feat.cpu(), coord.cpu())

        torch.cuda.empty_cache()
        print("✓ All embeddings moved to CPU, GPU memory released.")
956
+
957
+ # -------------------- public: precompute pHLA features --------------------
958
    def prepare_pep_hla_features(self, df: pd.DataFrame):
        """
        Precompute peptide-HLA features using pretrained PeptideHLABindingPredictor.
        The resulting features are stored in self.pep_hla_feat_dict and later
        injected into each batch.

        Args:
            df: DataFrame with 'peptide' and 'HLA_full' columns; only unique
                (peptide, HLA) pairs are encoded.

        Side effects:
            - Puts the pretrained pHLA model in eval mode and freezes its
              parameters (requires_grad=False).
            - Fills self.pep_hla_feat_dict[(pep, hla)] with CPU tensors.

        NOTE(review): relies on self.max_pep_len being set by a previous
        prepare_embeddings() call — if predict() is run on a new dataset
        without re-preparing embeddings, max_pep_len may be stale; confirm.
        """
        assert self.phys_dict is not None and self.esm2_dict is not None and self.struct_dict is not None, \
            "Call prepare_embeddings() first."

        pairs = {(row['peptide'], row['HLA_full']) for _, row in df.iterrows()}
        self.pep_hla_model.eval()
        # Freeze the pretrained pHLA model — it is a fixed feature extractor.
        for p in self.pep_hla_model.parameters():
            p.requires_grad = False

        print(f"\nPrecomputing peptide-HLA features for {len(pairs)} unique pairs...")
        with torch.no_grad():
            for pep, hla in tqdm(pairs, desc="pHLA features"):
                # unsqueeze(0): add a batch dimension for single-pair encoding.
                pep_phys = self.phys_dict['pep'][pep].unsqueeze(0).to(self.device)
                pep_esm = self.esm2_dict['pep'][pep].unsqueeze(0).to(self.device)
                # If your PeptideHLABindingPredictor supports peptide structure, pass it too:
                pep_struct, pep_coord = self.struct_dict['pep'][pep]
                pep_struct = pep_struct.unsqueeze(0).to(self.device)
                pep_coord = pep_coord.unsqueeze(0).to(self.device)

                hla_phys = self.phys_dict['hla'][hla].unsqueeze(0).to(self.device)
                hla_esm = self.esm2_dict['hla'][hla].unsqueeze(0).to(self.device)
                hla_struct, hla_coord = self.struct_dict['hla'][hla]
                hla_struct = hla_struct.unsqueeze(0).to(self.device)
                hla_coord = hla_coord.unsqueeze(0).to(self.device)

                # NOTE: encode_peptide_hla must accept (pep_struct, pep_coord) if you upgraded it;
                # otherwise remove those two args.
                pep_feat, hla_feat = self.pep_hla_model.encode_peptide_hla(
                    pep,
                    pep_phys, pep_esm,
                    hla_phys, hla_esm,
                    hla_struct, hla_coord,
                    max_pep_len=self.max_pep_len
                )
                # Cache on CPU to keep GPU memory free during training.
                self.pep_hla_feat_dict[(pep, hla)] = {
                    'pep_feat_pretrain': pep_feat.squeeze(0).cpu(),  # [Lp, pep_dim]
                    'hla_feat_pretrain': hla_feat.squeeze(0).cpu()   # [Lh, hla_dim]
                }
        print("✓ Pretrained peptide-HLA features prepared.")
1001
+
1002
+ # -------------------- training --------------------
1003
    def train(
        self,
        df_train: pd.DataFrame,
        df_val: Optional[pd.DataFrame] = None,
        df_test: Optional[pd.DataFrame] = None,
        df_neg: Optional[pd.DataFrame] = None,
        epochs: int = 100,
        batch_size: int = 128,
        lr: float = 1e-4,
        patience: int = 5,
        num_workers: int = 8,
    ):
        """
        Train the TCR-pHLA model.

        Args:
            df_train: Training data.
            df_val: Optional validation data (enables early stopping and
                best-checkpoint saving).
            df_test: Optional test data for evaluation after each epoch.
            df_neg: Optional negative samples for training. Set when resample_negatives=True.
            epochs: Number of epochs.
            batch_size: Batch size.
            lr: Learning rate.
            patience: Early stopping patience.
            num_workers: Data loading workers.

        Returns:
            history: Dict containing training and validation metrics.

        NOTE(review): if train_loader yields no batches, `ibatch` is never
        bound and the per-epoch average raises NameError — confirm callers
        never pass an empty training set. `average_precision_score` is assumed
        to be imported at module level (not visible in this chunk) — verify.
        """

        # ---- Prepare embeddings ----
        # pHLA features must cover every split, so concatenate all provided frames.
        print("Preparing peptide-HLA features...")
        all_dfs = [df for df in [df_train, df_val, df_test, df_neg] if df is not None]
        self.prepare_pep_hla_features(pd.concat(all_dfs, axis=0))

        # ---- Validation loader (optional) ----
        if df_val is not None:
            val_ds = TCRPepHLA_Dataset(df_val, self.phys_dict, self.esm2_dict, self.struct_dict, self.pep_hla_feat_dict)
            val_loader = torch.utils.data.DataLoader(
                val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers,
                collate_fn=tcr_pep_hla_collate_fn, pin_memory=True
            )
            # Early stopper tracks best val AUC and checkpoints to model_save_path.
            stopper = EarlyStopping(patience=patience, save_path=self.model_save_path)
        else:
            val_loader, stopper = None, None

        # ---- Optimizer ----
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)

        # ---- Metric history ----
        history = {'train_loss': [], 'train_auc': []}
        if df_val is not None:
            history.update({'val_loss': [], 'val_auc': [], 'val_prc': []})

        print("\nStart training TCR–pHLA model...")
        df_train_pos = df_train[df_train['label'] == 1].copy().reset_index(drop=True)

        for epoch in range(epochs):
            # ---------- Training ----------
            # With resampling, negatives are re-drawn each epoch (seeded by
            # the epoch index) and combined with any fixed negatives.
            if self.resample_negatives:
                df_train_neg = negative_sampling_phla(df_train_pos, random_state=epoch)
                if df_neg is not None:
                    df_train_neg = pd.concat([df_train_neg, df_neg], axis=0).reset_index(drop=True)
                df_train_resample = pd.concat([df_train_pos, df_train_neg], axis=0).reset_index(drop=True)
                train_ds = TCRPepHLA_Dataset(df_train_resample, self.phys_dict, self.esm2_dict, self.struct_dict, self.pep_hla_feat_dict)
            else:
                train_ds = TCRPepHLA_Dataset(df_train, self.phys_dict, self.esm2_dict, self.struct_dict, self.pep_hla_feat_dict)

            train_loader = torch.utils.data.DataLoader(
                train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers,
                collate_fn=tcr_pep_hla_collate_fn, pin_memory=True
            )

            self.model.train()
            train_labels, train_preds = [], []
            epoch_loss = 0.0

            for ibatch, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]")):
                optimizer.zero_grad()
                probs, loss, _, _ = self.model(batch)
                loss.backward()
                # Clip gradients to stabilize training.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=2.0)
                optimizer.step()

                epoch_loss += loss.item()
                train_labels.extend(batch['label'].cpu().numpy().tolist())
                train_preds.extend(probs.detach().cpu().numpy().tolist())

            train_auc = roc_auc_score(train_labels, train_preds)
            train_loss = epoch_loss / (ibatch + 1)
            history['train_loss'].append(train_loss)
            history['train_auc'].append(train_auc)
            print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Train AUC: {train_auc:.4f}")

            # ---------- Validation ----------
            if df_val is not None:
                self.model.eval()
                val_loss_sum, val_labels, val_preds = 0.0, [], []
                with torch.no_grad():
                    for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]"):
                        probs, loss, _, _ = self.model(batch)
                        val_loss_sum += loss.item()
                        val_labels.extend(batch['label'].cpu().numpy().tolist())
                        val_preds.extend(probs.detach().cpu().numpy().tolist())

                val_loss = val_loss_sum / len(val_loader)
                val_auc = roc_auc_score(val_labels, val_preds)
                val_prc = average_precision_score(val_labels, val_preds)
                history['val_loss'].append(val_loss)
                history['val_auc'].append(val_auc)
                history['val_prc'].append(val_prc)
                print(f"Epoch {epoch+1}/{epochs} | Val AUC: {val_auc:.4f} | Val PRC: {val_prc:.4f} | Val Loss: {val_loss:.4f}")

                # Stopper checkpoints on improved val AUC; stop after `patience`
                # epochs without improvement.
                stopper(val_auc, self.model)
                if stopper.early_stop:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

            # ---------- Optional Test ----------
            # Monitoring only: test metrics are printed, not stored in history.
            if df_test is not None:
                test_ds = TCRPepHLA_Dataset(df_test, self.phys_dict, self.esm2_dict, self.struct_dict, self.pep_hla_feat_dict)
                test_loader = torch.utils.data.DataLoader(
                    test_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers,
                    collate_fn=tcr_pep_hla_collate_fn, pin_memory=True
                )
                self.model.eval()
                test_labels, test_preds = [], []
                with torch.no_grad():
                    for batch in tqdm(test_loader, desc=f"Epoch {epoch+1}/{epochs} [Test]"):
                        probs, _, _, _ = self.model(batch)
                        test_labels.extend(batch['label'].cpu().numpy().tolist())
                        test_preds.extend(probs.detach().cpu().numpy().tolist())
                test_auc = roc_auc_score(test_labels, test_preds)
                test_prc = average_precision_score(test_labels, test_preds)
                print(f"Epoch {epoch+1}/{epochs} | Test AUC: {test_auc:.4f} | Test PRC: {test_prc:.4f}")

        # ---- Load best model only if validation used ----
        if df_val is not None and os.path.exists(self.model_save_path):
            self.model.load_state_dict(torch.load(self.model_save_path, map_location=self.device))
            print(f"✓ Training finished. Best model loaded from {self.model_save_path}")
        else:
            print("✓ Training finished (no validation set used).")

        return history
1147
+
1148
    def train_kfold(
        self,
        train_folds: List[Tuple[pd.DataFrame, pd.DataFrame]],
        df_test: Optional[pd.DataFrame] = None,
        df_neg: Optional[pd.DataFrame] = None,
        epochs: int = 100,
        batch_size: int = 128,
        lr: float = 1e-4,
        patience: int = 8,
        num_workers: int = 8,
    ) -> List[Dict[str, List[float]]]:
        """
        K-fold cross-validation training for TCR-pHLA model.

        Each fold re-seeds RNGs (seed + fold_id), rebuilds the model from
        scratch, trains via self.train(), and saves its weights to
        `<model_save_path>_fold{i}.pt`.

        Args:
            train_folds: list of (train_df, val_df) tuples for each fold
            df_test: optional test data for evaluation after each epoch
            df_neg: optional negative samples for training. Set when resample_negatives=True.
            epochs: training epochs
            batch_size: batch size
            lr: learning rate
            patience: early stopping patience
            num_workers: dataloader workers

        Returns:
            List of training histories for each fold

        NOTE(review): the summary guard `if df_val is not None` reads the loop
        variable from the *last* fold (NameError if train_folds is empty).
        Also assumes the model instance exposes tcr_dim/pep_dim/hla_dim/
        bilinear_dim attributes — confirm against TCRPeptideHLABindingPredictor.
        """
        num_folds = len(train_folds)
        all_histories = []

        print("\n" + "=" * 70)
        print(f"Starting {num_folds}-Fold Cross-Validation Training (TCR-pHLA)")
        print("=" * 70)

        for fold_id, (df_train, df_val) in enumerate(train_folds):
            print(f"\n{'='*70}")
            print(f"Training Fold {fold_id+1}/{num_folds}")
            print(f"{'='*70}")

            # Distinct but reproducible seed per fold.
            self._set_seed(self.seed + fold_id)

            # Fresh model per fold, mirroring the current model's dimensions.
            self.model = TCRPeptideHLABindingPredictor(
                tcr_dim=self.model.tcr_dim,
                pep_dim=self.model.pep_dim,
                hla_dim=self.model.hla_dim,
                bilinear_dim=self.model.bilinear_dim,
                loss_fn=self.loss_fn_name,
                alpha=self.alpha,
                gamma=self.gamma,
                pos_weights=self.pos_weights,
                device=str(self.device),
            ).to(self.device)

            fold_save_path = self.model_save_path.replace(".pt", f"_fold{fold_id}.pt")

            history = self.train(
                df_train=df_train,
                df_val=df_val,
                df_test=df_test,
                df_neg=df_neg,
                epochs=epochs,
                batch_size=batch_size,
                lr=lr,
                patience=patience,
                num_workers=num_workers,
            )

            # self.train() already restored the best checkpoint; persist it
            # under the fold-specific path for later ensembling.
            torch.save(self.model.state_dict(), fold_save_path)
            print(f"✓ Saved fold {fold_id} model to {fold_save_path}")

            all_histories.append(history)

        print("\n" + "=" * 70)
        print(f"✓ All {num_folds} folds training completed (TCR-pHLA)")
        print("=" * 70)

        if df_val is not None:
            print("\nCross-Validation Summary:")
            print("-" * 70)
            for fold_id, hist in enumerate(all_histories):
                best_auc = max(hist['val_auc'])
                best_prc = max(hist['val_prc'])
                best_epoch = hist['val_auc'].index(best_auc) + 1
                print(f"Fold {fold_id}: Best Val AUC = {best_auc:.4f}, Best Val PRC = {best_prc:.4f}, (Epoch {best_epoch})")

            mean_auc = np.mean([max(h['val_auc']) for h in all_histories])
            std_auc = np.std([max(h['val_auc']) for h in all_histories])
            print("-" * 70)
            print(f"Mean Val AUC: {mean_auc:.4f} ± {std_auc:.4f}")
            print("=" * 70 + "\n")

        return all_histories
1240
+
1241
+ # -------------------- single-set predict --------------------
1242
    def _predict_single(
        self, df: pd.DataFrame,
        batch_size: int = 128,
        return_probs: bool = True,
        num_workers: int = 8
    ):
        """
        Run inference with the currently loaded model.

        Args:
            df: DataFrame with tcra, tcrb, peptide and HLA_full columns
                (pHLA features must already be in self.pep_hla_feat_dict).
            batch_size: Inference batch size.
            return_probs: If False, threshold probabilities at 0.5.
            num_workers: DataLoader workers.

        Returns:
            Tuple of (predictions ndarray, list of per-batch peptide feature
            tensors, list of per-batch attention dicts).
        """
        self.model.eval()
        ds = TCRPepHLA_Dataset(df, self.phys_dict, self.esm2_dict, self.struct_dict, self.pep_hla_feat_dict)
        loader = torch.utils.data.DataLoader(
            ds,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=tcr_pep_hla_collate_fn,
            num_workers=num_workers,
            pin_memory=True
        )

        preds = []
        pep_feat_all = []
        attn_all = []
        with torch.no_grad():
            for batch in tqdm(loader, desc="Predicting (TCR-pHLA)"):
                probs, _, pep_feature, attn_dict = self.model(batch)
                preds.extend(probs.tolist())
                pep_feat_all.append(pep_feature)
                attn_all.append(attn_dict)

        preds = np.array(preds)

        if not return_probs:
            preds = (preds >= 0.5).astype(int)

        return preds, pep_feat_all, attn_all
1275
+
1276
+ # ================================================================
1277
+ # Ensemble prediction
1278
+ # ================================================================
1279
+ def _predict_ensemble(
1280
+ self,
1281
+ df: pd.DataFrame,
1282
+ batch_size: int = 128,
1283
+ num_folds: int = 5,
1284
+ ensemble_method: str = 'mean',
1285
+ return_probs: bool = True,
1286
+ num_workers: int = 8
1287
+ ) -> np.ndarray:
1288
+ """
1289
+ Ensemble prediction using multiple fold models.
1290
+ """
1291
+ print(f"\nEnsemble prediction using {num_folds} TCR–pHLA models...")
1292
+ print(f"Ensemble method: {ensemble_method}")
1293
+
1294
+ pep_feats_folds = []
1295
+ attn_dict_folds = []
1296
+ all_preds = []
1297
+ for fold_id in range(num_folds):
1298
+ fold_model_path = self.model_save_path.replace(".pt", f"_fold{fold_id}.pt")
1299
+ if not os.path.exists(fold_model_path):
1300
+ print(f"⚠ Warning: {fold_model_path} not found, skipping...")
1301
+ continue
1302
+
1303
+ print(f"Loading model from {fold_model_path}...")
1304
+ self.model.load_state_dict(torch.load(fold_model_path, map_location=self.device), strict=False)
1305
+
1306
+ # Predict for this fold
1307
+ fold_preds, fold_pep_feature, fold_attn_dict = self._predict_single(
1308
+ df, batch_size=batch_size, return_probs=True, num_workers=num_workers
1309
+ )
1310
+ all_preds.append(fold_preds)
1311
+ pep_feats_folds.append(fold_pep_feature)
1312
+ attn_dict_folds.append(fold_attn_dict)
1313
+
1314
+ if len(all_preds) == 0:
1315
+ raise ValueError("No fold models found!")
1316
+
1317
+ if ensemble_method == 'mean':
1318
+ ensemble_preds = np.mean(all_preds, axis=0)
1319
+ elif ensemble_method == 'median':
1320
+ ensemble_preds = np.median(all_preds, axis=0)
1321
+ else:
1322
+ raise ValueError(f"Unknown ensemble method: {ensemble_method}")
1323
+
1324
+ print(f"✓ Ensemble prediction completed using {len(all_preds)} folds")
1325
+
1326
+ if not return_probs:
1327
+ ensemble_preds = (ensemble_preds >= 0.5).astype(int)
1328
+
1329
+ return ensemble_preds, pep_feats_folds, attn_dict_folds
1330
+
1331
+
1332
+ # ================================================================
1333
+ # Unified predict() with ensemble support
1334
+ # ================================================================
1335
+ def predict(
1336
+ self,
1337
+ df: pd.DataFrame,
1338
+ batch_size: int = 128,
1339
+ return_probs: bool = True,
1340
+ use_kfold: bool = False,
1341
+ num_folds: Optional[int] = None,
1342
+ ensemble_method: str = 'mean',
1343
+ num_workers: int = 8
1344
+ ) -> Tuple[np.ndarray, List, List]:
1345
+ """
1346
+ Predict binding probabilities or binary labels.
1347
+
1348
+ If use_kfold=True, averages predictions across fold models.
1349
+ """
1350
+ print('Preparing peptide-HLA features for prediction set...')
1351
+ self.prepare_pep_hla_features(df)
1352
+
1353
+ if use_kfold:
1354
+ if num_folds is None:
1355
+ raise ValueError("num_folds must be specified when use_kfold=True")
1356
+ return self._predict_ensemble(
1357
+ df=df,
1358
+ batch_size=batch_size,
1359
+ num_folds=num_folds,
1360
+ ensemble_method=ensemble_method,
1361
+ return_probs=return_probs,
1362
+ num_workers=num_workers
1363
+ )
1364
+ else:
1365
+ return self._predict_single(df, batch_size=batch_size, return_probs=return_probs, num_workers=num_workers)
1366
+
1367
+
1368
+ # ================================================================
1369
+ # Unified evaluate() with ensemble support
1370
+ # ================================================================
1371
    def evaluate(
        self,
        df: pd.DataFrame,
        batch_size: int = 128,
        threshold: float = 0.5,
        use_kfold: bool = False,
        num_folds: Optional[int] = None,
        ensemble_method: str = 'mean',
        num_workers: int = 8
    ) -> Dict[str, float]:
        """
        Evaluate model performance on a dataset.

        If use_kfold=True, performs ensemble evaluation across folds.

        Args:
            df: DataFrame with tcra, tcrb, peptide, HLA_full and label columns.
            batch_size: Inference batch size.
            threshold: Probability cutoff for binary predictions.
            use_kfold: Ensemble over fold checkpoints instead of a single model.
            num_folds: Number of folds (required when use_kfold=True).
            ensemble_method: 'mean' or 'median' fold aggregation.
            num_workers: DataLoader workers.

        Returns:
            Dict of metrics (auc, accuracy, mcc, f1, precision, recall, and
            confusion-matrix counts).
        """
        y_true = df['label'].values
        y_prob, all_pep_features, merged_attn = self.predict(
            df,
            batch_size=batch_size,
            return_probs=True,
            use_kfold=use_kfold,
            num_folds=num_folds,
            ensemble_method=ensemble_method,
            num_workers=num_workers
        )
        y_pred = (y_prob >= threshold).astype(int)

        # labels=[0, 1] fixes the matrix layout even if a class is absent.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel().tolist()
        # The 1e-9 terms guard every denominator against division by zero.
        accuracy = (tp + tn) / (tn + fp + fn + tp + 1e-9)
        try:
            mcc = ((tp*tn) - (fn*fp)) / np.sqrt(float((tp+fn)*(tn+fp)*(tp+fp)*(tn+fn)) + 1e-9)
        except:
            mcc = 0.0
        recall = tp / (tp + fn + 1e-9)
        precision = tp / (tp + fp + 1e-9)
        f1 = 2 * precision * recall / (precision + recall + 1e-9)
        try:
            # NOTE(review): max_fpr=0.1 yields a *partial* (standardized) AUC
            # over FPR <= 0.1, not the full ROC AUC — confirm this is intended
            # before comparing with other models' reported AUCs.
            auc = roc_auc_score(y_true, y_prob, max_fpr=0.1)
        except:
            auc = 0.0

        print("\n" + "=" * 70)
        print(f"Evaluation Results [{'K-Fold Ensemble' if use_kfold else 'Single Model'}]")
        print("=" * 70)
        print(f"tn={tn}, fp={fp}, fn={fn}, tp={tp}")
        print(f"AUC={auc:.4f} | ACC={accuracy:.4f} | MCC={mcc:.4f} | F1={f1:.4f} | P={precision:.4f} | R={recall:.4f}")
        print("=" * 70 + "\n")

        return dict(
            auc=auc, accuracy=accuracy, mcc=mcc, f1=f1,
            precision=precision, recall=recall,
            tn=tn, fp=fp, fn=fn, tp=tp
        )
src/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d6a02d441849cccafebf30918fd04d2379d8b8fe06b50de49fcabf8e86d77af
3
+ size 22006159
src/model.py ADDED
@@ -0,0 +1,1995 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import numpy as np
6
+ from typing import Dict, List, Tuple
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+ from torch.nn import TransformerEncoder, TransformerEncoderLayer
9
+
10
+ import esm
11
+
12
+ import pandas as pd
13
+ from tqdm import tqdm
14
+ from typing import Dict, List, Tuple
15
+
16
+ import tempfile
17
+ from pathlib import Path
18
+ import mdtraj as md
19
+
20
+ # import io
21
+ # import gzip
22
+ import os
23
+
24
+ from egnn_pytorch import EGNN
25
+
26
+ from transformers import AutoTokenizer, EsmForProteinFolding
27
+
28
+ import logging
29
+ logging.basicConfig(level=logging.INFO)
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # from re import search as re_search
33
+ import re
34
+
35
+
36
def determine_tcr_seq_vj(cdr3,V,J,chain,guess01=False):
    """Reconstruct full-length TCR protein sequences from CDR3 + V/J gene names.

    Args:
        cdr3: list of CDR3 amino-acid sequences (conventionally C-starting,
            F/W-ending), parallel to `V` and `J`.
        V, J: lists of V/J allele names (e.g. 'TRBV...*01'), parallel to `cdr3`.
        chain: 'A' selects the alpha-chain gene tables; any other value
            selects the beta-chain tables.
        guess01: if True, append '*01' to any allele name lacking an explicit
            '*' allele suffix before lookup.

    Returns:
        List of spliced full-length sequences, parallel to `cdr3`.

    Raises:
        KeyError: if a V/J allele name is not found in the library tables.
        ValueError: if a V sequence contains no 'C' to anchor the CDR3.
    """

    def file2dict(filename,key_fields,store_fields,delimiter='\t'):
        """Read file to a dictionary.
        key_fields: fields to be used as keys
        store_fields: fields to be saved as a list
        delimiter: delimiter used in the given file."""
        dictionary={}
        with open(filename, newline='') as csvfile:
            reader = csv.DictReader(csvfile,delimiter=delimiter)
            for row in reader:
                keys = [row[k] for k in key_fields]
                store= [row[s] for s in store_fields]

                # Walk/extend the nested dict one key level at a time;
                # the innermost level accumulates a list of `store` rows.
                sub_dict = dictionary
                for key in keys[:-1]:
                    if key not in sub_dict:
                        sub_dict[key] = {}
                    sub_dict = sub_dict[key]
                key = keys[-1]
                if key not in sub_dict:
                    sub_dict[key] = []
                sub_dict[key].append(store)
        return dictionary

    def get_protseqs_ntseqs(chain='B'):
        """returns sequence dictionaries for genes: protseqsV, protseqsJ, nucseqsV, nucseqsJ"""
        seq_dicts=[]
        for gene,type in zip(['v','j','v','j'],['aa','aa','nt','nt']):
            # e.g. 'library/trbvs_aa.tsv' for chain='B', gene='v', type='aa'
            name = 'library/'+'tr'+chain.lower()+gene+'s_'+type+'.tsv'
            sdict = file2dict(name,key_fields=['Allele'],store_fields=[type+'_seq'])
            for g in sdict:
                # Unwrap the single stored [[seq]] entry to a plain string.
                sdict[g]=sdict[g][0][0]
            seq_dicts.append(sdict)
        return seq_dicts

    # NOTE(review): these library tables are re-read from disk on every call
    # to determine_tcr_seq_vj; consider caching at module level if hot.
    protVb,protJb,_,_ = get_protseqs_ntseqs(chain='B')
    protVa,protJa,_,_ = get_protseqs_ntseqs(chain='A')

    def splice_v_cdr3_j(pv: str, pj: str, cdr3: str) -> str:
        """
        pv: V gene protein sequence
        pj: J gene protein sequence
        cdr3: C-starting, F/W-ending CDR3 sequence (protein)
        Returns: The spliced full sequence (V prefix + CDR3 + J suffix)
        """
        pv = (pv or "").strip().upper()
        pj = (pj or "").strip().upper()
        cdr3 = (cdr3 or "").strip().upper()

        # 1) V segment: everything before the last 'C'. The conserved C itself
        #    is NOT kept here because the CDR3 starts with that same C.
        cpos = pv.rfind('C')
        if cpos == -1:
            raise ValueError("V sequence has no 'C' to anchor CDR3 start.")
        v_prefix = pv[:cpos]  # V prefix, excluding the conserved C (CDR3 supplies it)

        # 2) Align the CDR3's end-overlap within J: starting from the full
        #    CDR3 and shortening one residue at a time, find the longest
        #    CDR3 suffix that occurs in J, then keep what follows it in J.
        j_suffix = pj  # fallback (in extreme cases where no overlap is found)
        for k in range(len(cdr3), 0, -1):
            tail = cdr3[-k:]  # CDR3's suffix of length k
            m = re.search(re.escape(tail), pj)
            if m:
                j_suffix = pj[m.end():]  # J residues after the matched segment
                break

        return v_prefix + cdr3 + j_suffix

    tcr_list = []
    for i in range(len(cdr3)):
        cdr3_ = cdr3[i]
        V_ = V[i]
        J_ = J[i]
        if chain=='A':
            protseqsV = protVa
            protseqsJ = protJa
        else:
            protseqsV = protVb
            protseqsJ = protJb
        if guess01:
            # Default to the *01 allele when no explicit allele is given.
            if '*' not in V_:
                V_+='*01'
            if '*' not in J_:
                J_+='*01'
        pv = protseqsV[V_]
        pj = protseqsJ[J_]
        # t = pv[:pv.rfind('C')]+ cdr3_ + pj[re_search(r'[FW]G.[GV]',pj).start()+1:]
        t = splice_v_cdr3_j(pv, pj, cdr3_)
        tcr_list.append(t)
    return tcr_list
126
+
127
+ # def negative_sampling_phla(df, neg_ratio=5, label_col='label', neg_label=0, random_state=42):
128
+ # """
129
+ # Create negative samples by shuffling the TCR sequences while keeping the peptide-HLA pairs intact.
130
+ # Ensures that the generated (TCR, peptide, HLA) triplets do not exist in the original dataset.
131
+ # """
132
+ # negative_samples = []
133
+
134
+ # # 正样本 triplet 集合
135
+ # pos_triplets = set(zip(
136
+ # df['tcra'], df['tcrb'], df['peptide'], df['HLA_full']
137
+ # ))
138
+
139
+ # for i in range(neg_ratio):
140
+ # shuffled_df = df.copy()
141
+
142
+ # tcr_cols = ['tcra', 'cdr3a_start', 'cdr3a_end', 'tcrb', 'cdr3b_start', 'cdr3b_end']
143
+ # shuffled_tcr = df[tcr_cols].sample(frac=1, random_state=random_state + i).reset_index(drop=True)
144
+
145
+ # for col in tcr_cols:
146
+ # shuffled_df[col] = shuffled_tcr[col]
147
+
148
+ # # 剔除:1) TCR 未改变的行 2) triplet 与正样本重复
149
+ # mask_keep = []
150
+ # for idx, row in shuffled_df.iterrows():
151
+ # triplet = (row['tcra'], row['tcrb'], row['peptide'], row['HLA_full'])
152
+ # if triplet in pos_triplets:
153
+ # mask_keep.append(False)
154
+ # else:
155
+ # mask_keep.append(True)
156
+
157
+ # shuffled_df = shuffled_df[mask_keep]
158
+ # shuffled_df[label_col] = neg_label
159
+
160
+ # negative_samples.append(shuffled_df)
161
+
162
+ # negative_samples = pd.concat(negative_samples, ignore_index=True).drop_duplicates()
163
+ # return negative_samples
164
+
165
+ import numpy as np
166
+ import pandas as pd
167
+
168
+ # def balanced_negative_sampling_phla(df, label_col='label', neg_label=0, random_state=42):
169
+ # """
170
+ # 为每个 (peptide, HLA_full) 平衡采样负样本:
171
+ # - 找出正样本最多的 peptide
172
+ # - 该 peptide 的负样本数量 = 1:1,从其他 peptide 的 TCR 中采样(保持 peptide–HLA 配对)
173
+ # - 其他 peptide 采样负样本,使每个 peptide 拥有相同总样本数
174
+ # - 保证 peptide 与 HLA_full 始终保持配对关系
175
+ # """
176
+ # np.random.seed(random_state)
177
+
178
+ # pos_df = df[df[label_col] != neg_label].copy()
179
+ # pos_counts = pos_df['peptide'].value_counts()
180
+ # max_peptide = pos_counts.idxmax()
181
+ # max_pos = pos_counts.max()
182
+ # total_target = max_pos * 2 # 每个 peptide 的最终样本数(正+负)
183
+
184
+ # neg_samples = []
185
+
186
+ # # 针对 max_peptide:负样本 = 1:1
187
+ # df_other_tcrs = pos_df[pos_df['peptide'] != max_peptide][['tcra', 'tcrb', 'cdr3a_start', 'cdr3a_end', 'cdr3b_start', 'cdr3b_end']].copy()
188
+ # neg_max = pos_df[pos_df['peptide'] == max_peptide].copy()
189
+ # sampled_tcrs = df_other_tcrs.sample(
190
+ # n=max_pos,
191
+ # replace=True if len(df_other_tcrs) < max_pos else False,
192
+ # random_state=random_state
193
+ # ).reset_index(drop=True)
194
+ # neg_max.update(sampled_tcrs)
195
+ # neg_max[label_col] = neg_label
196
+ # neg_samples.append(neg_max)
197
+
198
+ # # 针对其他 peptides
199
+ # for pep, n_pos in pos_counts.items():
200
+ # if pep == max_peptide:
201
+ # continue
202
+ # n_neg = max(0, total_target - n_pos)
203
+ # df_other_tcrs = pos_df[pos_df['peptide'] != pep][['tcra', 'tcrb', 'cdr3a_start', 'cdr3a_end', 'cdr3b_start', 'cdr3b_end']].copy()
204
+ # neg_pep = pos_df[pos_df['peptide'] == pep].copy()
205
+ # sampled_tcrs = df_other_tcrs.sample(
206
+ # n=min(len(df_other_tcrs), n_neg),
207
+ # replace=True if len(df_other_tcrs) < n_neg else False,
208
+ # random_state=random_state
209
+ # ).reset_index(drop=True)
210
+ # sampled_tcrs = sampled_tcrs.iloc[:len(neg_pep)].copy() if len(sampled_tcrs) > len(neg_pep) else sampled_tcrs
211
+ # neg_pep = pd.concat(
212
+ # [neg_pep]*int(np.ceil(n_neg / len(neg_pep))), ignore_index=True
213
+ # ).iloc[:n_neg]
214
+ # neg_pep.update(sampled_tcrs)
215
+ # neg_pep[label_col] = neg_label
216
+ # neg_samples.append(neg_pep)
217
+
218
+ # neg_df = pd.concat(neg_samples, ignore_index=True)
219
+ # final_df = pd.concat([pos_df, neg_df], ignore_index=True).reset_index(drop=True)
220
+
221
+ # return final_df
222
+
223
def negative_sampling_phla(df, neg_ratio=5, label_col='label', neg_label=0, random_state=42):
    """
    Create negative samples by shuffling TCRs while keeping peptide–HLA pairs intact.

    Each round permutes the TCR columns across rows; rows whose resulting
    (tcra, tcrb, peptide, HLA_full) triplet already exists as a positive are
    dropped. Rounds repeat until at least ``neg_ratio × len(df)`` candidate
    rows are collected (or a safety cap on rounds is hit), then the pool is
    de-duplicated and down-sampled to the target size.

    Args:
        df: positive-sample DataFrame with columns 'tcra', 'tcrb', 'peptide',
            'HLA_full', the cdr3 boundary columns, and `label_col`.
        neg_ratio: target negatives per positive row.
        label_col: name of the label column to set on generated negatives.
        neg_label: label value assigned to negatives.
        random_state: base seed; each round uses ``random_state + round_idx``.

    Returns:
        DataFrame of at most ``len(df) * neg_ratio`` negative rows.
    """
    # Retained for backward compatibility; the pandas `sample` calls below use
    # their explicit random_state, not numpy's global RNG.
    np.random.seed(random_state)
    pos_triplets = set(zip(df['tcra'], df['tcrb'], df['peptide'], df['HLA_full']))
    tcr_cols = ['tcra', 'cdr3a_start', 'cdr3a_end', 'tcrb', 'cdr3b_start', 'cdr3b_end']

    n_pos = len(df)
    target_n_neg = n_pos * neg_ratio
    all_neg = []
    n_collected = 0  # total candidate ROWS collected (the original compared
                     # len(all_neg) — the number of DataFrames — to a row count)

    round_idx = 0
    max_rounds = max(neg_ratio, 1) * 20  # safety cap: avoid spinning forever
    while n_collected < target_n_neg and round_idx < max_rounds:
        shuffled_df = df.copy()
        shuffled_tcr = df[tcr_cols].sample(frac=1, random_state=random_state + round_idx).reset_index(drop=True)
        for col in tcr_cols:
            shuffled_df[col] = shuffled_tcr[col]

        # Keep only rows whose shuffled triplet is not a known positive.
        mask_keep = [
            (row['tcra'], row['tcrb'], row['peptide'], row['HLA_full']) not in pos_triplets
            for _, row in shuffled_df.iterrows()
        ]
        shuffled_df = shuffled_df[mask_keep]
        shuffled_df[label_col] = neg_label

        all_neg.append(shuffled_df)
        n_collected += len(shuffled_df)
        round_idx += 1

    if not all_neg:
        # Empty input (or zero target): return an empty frame with the
        # expected columns rather than crashing in pd.concat.
        negative_samples = df.iloc[0:0].copy()
        negative_samples[label_col] = pd.Series(dtype='int64')
        return negative_samples

    negative_samples = pd.concat(all_neg, ignore_index=True).drop_duplicates()
    negative_samples = negative_samples.sample(
        n=min(len(negative_samples), target_n_neg), random_state=random_state
    ).reset_index(drop=True)

    return negative_samples
262
+
263
+ # def negative_sampling_tcr(df, neg_ratio=5, label_col='label', neg_label=0, random_state=42):
264
+ # """
265
+ # Create negative samples by keeping TCR fixed but assigning random (peptide, HLA_full)
266
+ # pairs that do not exist in the original dataset.
267
+ # Ensures that the generated (TCR, peptide, HLA) triplets do not exist in the original data.
268
+ # """
269
+ # np.random.seed(random_state)
270
+ # negative_samples = []
271
+
272
+ # pos_triplets = set(zip(df['tcra'], df['tcrb'], df['peptide'], df['HLA_full']))
273
+
274
+ # all_pairs = list(set(zip(df['peptide'], df['HLA_full'])))
275
+
276
+ # for i in range(neg_ratio):
277
+ # neg_df = df.copy()
278
+
279
+ # # 随机打乱 peptide–HLA 对,但保证不会选原来的那一个
280
+ # new_pairs = []
281
+ # for _, row in df.iterrows():
282
+ # while True:
283
+ # pep, hla = all_pairs[np.random.randint(len(all_pairs))]
284
+ # triplet = (row['tcra'], row['tcrb'], pep, hla)
285
+ # if triplet not in pos_triplets:
286
+ # new_pairs.append((pep, hla))
287
+ # break
288
+
289
+ # neg_df[['peptide', 'HLA_full']] = pd.DataFrame(new_pairs, index=neg_df.index)
290
+ # neg_df[label_col] = neg_label
291
+ # negative_samples.append(neg_df)
292
+
293
+ # negative_samples = pd.concat(negative_samples, ignore_index=True).drop_duplicates()
294
+ # return negative_samples
295
+
296
class EarlyStopping:
    """Early stopping driven by a validation score (AUC).

    A checkpoint is written whenever ``val_auc`` exceeds the best seen value
    by more than ``delta``; otherwise a patience counter advances, and once
    it reaches ``patience`` the ``early_stop`` flag is raised.

    Attributes:
        counter: consecutive calls without improvement.
        early_stop: True once patience is exhausted.
        best_auc: best score observed so far.
        best_loss: kept for interface compatibility; not consulted.
    """

    def __init__(self, patience=10, verbose=True, delta=0.0, save_path='checkpoint.pt'):
        """
        Args:
            patience: number of non-improving calls tolerated before stopping.
            verbose: print progress messages when True.
            delta: minimum score increase that counts as an improvement.
            save_path: file path for the checkpoint (model state_dict).
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.early_stop = False
        self.delta = delta
        self.save_path = save_path

        self.best_loss = np.inf
        self.best_auc = -np.inf

    def __call__(self, val_auc, model):
        """Record one validation result; checkpoint on improvement."""
        if val_auc > self.best_auc + self.delta:
            # Improvement: remember it, persist the model, reset patience.
            self.best_auc = val_auc
            self.save_checkpoint(model, val_auc)
            self.counter = 0
            return

        # No improvement: advance the patience counter.
        self.counter += 1
        if self.verbose:
            print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, model, val_auc):
        """Save current best model."""
        if self.verbose:
            print(f"Validation improved → Saving model (Score={val_auc:.4f}) to {self.save_path}")
        torch.save(model.state_dict(), self.save_path)
337
+
338
+ # ============================================================================
339
+ # ESM2 Embedding via HuggingFace
340
+ # ============================================================================
341
class ESM2Encoder(nn.Module):
    """Per-residue sequence embeddings from the ESM2 650M model, with an
    on-disk cache keyed by (prefix, layer) so sequences are embedded once."""

    def __init__(self,
                 device="cuda:0",
                 layer=33,
                 cache_dir='cache'):
        """
        Initialize an ESM2 encoder.

        Args:
            device (str): Device to run on, e.g. 'cuda:0', 'cuda:1', or 'cpu'.
            layer (int): Layer number from which to extract representations.
            cache_dir (str): Cache directory name; if None, the directory of
                this source file is used instead.
        """
        super().__init__()
        self.device = device
        self.layer = layer

        if cache_dir is None:
            cache_dir = os.path.dirname(os.path.abspath(__file__))
        self.cache_dir = cache_dir
        # NOTE(review): this creates `cache_dir` relative to the CWD, while
        # _cache_path() below resolves it relative to this file's directory —
        # confirm which location is intended.
        os.makedirs(self.cache_dir, exist_ok=True)

        # Downloads/loads the pretrained ESM2 650M model; heavy side effect.
        self.model, self.alphabet = esm.pretrained.esm2_t33_650M_UR50D()
        self.batch_converter = self.alphabet.get_batch_converter()
        self.model = self.model.eval().to(device)

    def _cache_path(self, prefix):
        """Return the cache file path for `prefix`, creating the directory."""
        base_dir = os.path.dirname(os.path.abspath(__file__))
        base_dir = base_dir + "/" + self.cache_dir
        os.makedirs(base_dir, exist_ok=True)
        return os.path.join(base_dir, f"{prefix}_esm2_layer{self.layer}.pt")

    def save_obj(self, obj, path):
        """Save object to a file (no compression)."""
        torch.save(obj, path)

    def load_obj(self, path):
        """Load object from a file (no compression)."""
        return torch.load(path, map_location="cpu", weights_only=False)

    @torch.no_grad()
    def _embed_batch(self, batch_data):
        """Embed one batch of (label, sequence) pairs.

        Returns a list of [L_i, D] tensors on CPU, one per input sequence,
        with the BOS/EOS positions stripped (the 1:tokens_len-1 slice).
        """
        batch_labels, batch_strs, batch_tokens = self.batch_converter(batch_data)
        batch_tokens = batch_tokens.to(self.device)
        results = self.model(batch_tokens, repr_layers=[self.layer], return_contacts=False)
        token_representations = results["representations"][self.layer]
        # Real (non-padding) token count per sequence, including BOS/EOS.
        batch_lens = (batch_tokens != self.alphabet.padding_idx).sum(1)
        seq_reprs = []
        for i, tokens_len in enumerate(batch_lens):
            seq_repr = token_representations[i, 1:tokens_len-1].cpu()
            seq_reprs.append(seq_repr)
        return seq_reprs

    @torch.no_grad()
    def forward(self, df, seq_col, prefix, batch_size=64, re_embed=False, cache_save=True):
        """
        Add or update embeddings for sequences in a DataFrame.
        - If there are new sequences, automatically update the dictionary and save.
        - If re_embed=True, force re-computation of all sequences.

        Args:
            df: DataFrame whose column `seq_col` holds protein sequences.
            seq_col: column name to read sequences from.
            prefix: cache-file prefix (e.g. 'pep', 'hla').
            batch_size: sequences per model forward pass.
            re_embed: ignore any existing cache and recompute everything.
            cache_save: persist the updated dictionary back to disk.

        Returns:
            dict mapping uppercased sequence -> [L, D] CPU tensor.
        """
        cache_path = self._cache_path(prefix)
        emb_dict = {}

        if os.path.exists(cache_path) and not re_embed:
            print(f"[ESM2] Loading cached embeddings from {cache_path}")
            emb_dict = self.load_obj(cache_path)
        else:
            if re_embed:
                print(f"[ESM2] Re-embedding all sequences for {prefix}")
            else:
                print(f"[ESM2] No existing cache for {prefix}, will create new.")

        # Normalize and de-duplicate; non-string entries are silently skipped.
        seqs = [str(s).strip().upper() for s in df[seq_col].tolist() if isinstance(s, str)]
        unique_seqs = sorted(set(seqs))
        new_seqs = [s for s in unique_seqs if s not in emb_dict]

        if new_seqs:
            print(f"[ESM2] Found {len(new_seqs)} new sequences → computing embeddings...")
            data = [(str(i), s) for i, s in enumerate(new_seqs)]
            for i in tqdm(range(0, len(data), batch_size), desc=f"ESM2 update ({prefix})"):
                batch = data[i:i+batch_size]
                embs = self._embed_batch(batch)
                for (_, seq), emb in zip(batch, embs):
                    emb_dict[seq] = emb.clone()
            if cache_save:
                print(f"[ESM2] Updating cache with new sequences")
                self.save_obj(emb_dict, cache_path)
        else:
            print(f"[ESM2] No new sequences for {prefix}, using existing cache")

        return emb_dict
432
+
433
+ # ============================================================================
434
+ # ESMFold (transformers)
435
+ # ============================================================================
436
class ESMFoldPredictorHF(nn.Module):
    """Thin wrapper around HuggingFace's ESMFold for structure prediction."""

    def __init__(self,
                 model_name="facebook/esmfold_v1",
                 cache_dir=None,
                 device='cpu',
                 allow_tf32=True):
        """
        Args:
            model_name: HuggingFace model id for ESMFold.
            cache_dir: optional HuggingFace download cache directory.
            device: device the folded model runs on.
            allow_tf32: enable TF32 matmul/cudnn kernels (speed over precision
                on Ampere+ GPUs; a no-op on CPU).
        """
        super().__init__()
        self.model_name = model_name
        self.cache_dir = cache_dir
        self.device = device
        if allow_tf32:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

        # tokenizer and model — downloads weights on first use (heavy side effect)
        print(f"Loading ESMFold model {model_name} on {device}... {'with' if cache_dir else 'without'} cache_dir: {cache_dir}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        self.model = EsmForProteinFolding.from_pretrained(
            model_name, low_cpu_mem_usage=True, cache_dir=cache_dir
        ).eval().to(self.device)

    @torch.no_grad()
    def infer_pdb_str(self, seq: str) -> str:
        """Fold `seq` and return the predicted structure as a PDB-format string."""
        pdb_str = self.model.infer_pdb(seq)
        return pdb_str

    @torch.no_grad()
    def forward_raw(self, seq: str):
        """Run the model directly and return the raw ESMFoldOutput object."""
        inputs = self.tokenizer([seq], return_tensors="pt", add_special_tokens=False)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        outputs = self.model(**inputs)
        return outputs  # ESMFoldOutput
468
+
469
# Maximum accessible surface area (Å²) per residue type, used below to
# normalize Shrake–Rupley ASA into relative solvent accessibility (RSA).
# Name suggests the Tien et al. (2013) scale — TODO confirm against the paper.
MAX_ASA_TIEN = {
    "ALA": 129.0, "ARG": 274.0, "ASN": 195.0, "ASP": 193.0, "CYS": 167.0,
    "GLN": 225.0, "GLU": 223.0, "GLY": 104.0, "HIS": 224.0, "ILE": 197.0,
    "LEU": 201.0, "LYS": 236.0, "MET": 224.0, "PHE": 240.0, "PRO": 159.0,
    "SER": 155.0, "THR": 172.0, "TRP": 285.0, "TYR": 263.0, "VAL": 174.0,
}
# DSSP 8-state secondary-structure code -> one-hot index; '-' (no assignment)
# shares the coil ('C') slot.
SS8_INDEX = {"H":0,"B":1,"E":2,"G":3,"I":4,"T":5,"S":6,"C":7,"-":7}
+
477
class StructureFeatureExtractorNoDSSP(nn.Module):
    """Extract per-residue scalar features and CA coordinates from a PDB file.

    Feature layout (17 dims per residue):
        backbone-angle sin/cos (6) + SS8 one-hot (8) + RSA (1)
        + contact count (1) + pLDDT (1).

    NOTE(review): despite the "NoDSSP" name, `_ss8` calls mdtraj's DSSP
    implementation — confirm whether the name is historical.
    """

    def __init__(self, device="cpu"):
        super().__init__()
        self.device = device

        # 6 angle dims + 8 SS8 classes + RSA + contact count + pLDDT
        self.in_dim = 6 + 8 + 1 + 1 + 1 # 17

        self.to(torch.device(self.device))

    @torch.no_grad()
    def _angles(self, traj):
        """Per-residue sin/cos of phi, psi, omega.

        Residues whose dihedral is undefined (chain termini) keep sin=cos=0.
        Returns a [L, 6] float32 array.
        """
        L = traj.n_residues

        sphi = np.zeros(L, dtype=np.float32); cphi = np.zeros(L, dtype=np.float32)
        spsi = np.zeros(L, dtype=np.float32); cpsi = np.zeros(L, dtype=np.float32)
        someg = np.zeros(L, dtype=np.float32); comeg = np.zeros(L, dtype=np.float32)

        # 1) phi: (C_{i-1}, N_i, CA_i, C_i) — residue i is located via atoms[1] (N_i)
        phi_idx, phi_vals = md.compute_phi(traj)  # phi_vals: (1, n_phi)
        if phi_vals.size > 0:
            for k, atoms in enumerate(phi_idx):
                res_i = traj.topology.atom(int(atoms[1])).residue.index  # residue owning N_i
                if 0 <= res_i < L:
                    ang = float(phi_vals[0, k])
                    sphi[res_i] = np.sin(ang); cphi[res_i] = np.cos(ang)

        # 2) psi: (N_i, CA_i, C_i, N_{i+1}) — residue i is located via atoms[1] (CA_i)
        psi_idx, psi_vals = md.compute_psi(traj)
        if psi_vals.size > 0:
            for k, atoms in enumerate(psi_idx):
                res_i = traj.topology.atom(int(atoms[1])).residue.index  # residue owning CA_i
                if 0 <= res_i < L:
                    ang = float(psi_vals[0, k])
                    spsi[res_i] = np.sin(ang); cpsi[res_i] = np.cos(ang)

        # 3) omega: (CA_i, C_i, N_{i+1}, CA_{i+1}) — residue i is located via atoms[0] (CA_i)
        omg_idx, omg_vals = md.compute_omega(traj)
        if omg_vals.size > 0:
            for k, atoms in enumerate(omg_idx):
                res_i = traj.topology.atom(int(atoms[0])).residue.index  # residue owning CA_i
                if 0 <= res_i < L:
                    ang = float(omg_vals[0, k])
                    someg[res_i] = np.sin(ang); comeg[res_i] = np.cos(ang)

        angles_feat = np.stack([sphi, cphi, spsi, cpsi, someg, comeg], axis=-1)  # [L, 6]
        return angles_feat.astype(np.float32)

    @torch.no_grad()
    def _ss8(self, traj: md.Trajectory):
        """One-hot 8-state secondary structure per residue, [L, 8]."""
        ss = md.compute_dssp(traj, simplified=False)[0]
        L = traj.n_residues
        onehot = np.zeros((L, 8), dtype=np.float32)
        for i, ch in enumerate(ss):
            # Unknown codes fall back to the coil slot (index 7).
            onehot[i, SS8_INDEX.get(ch, 7)] = 1.0
        return onehot

    @torch.no_grad()
    def _rsa(self, traj: md.Trajectory):
        """Relative solvent accessibility in [0, 1] per residue, [L, 1]."""
        asa = md.shrake_rupley(traj, mode="residue")[0]  # (L,)
        rsa = np.zeros_like(asa, dtype=np.float32)
        for i, res in enumerate(traj.topology.residues):
            # Residue types missing from the table (non-standard) get RSA 0.
            max_asa = MAX_ASA_TIEN.get(res.name.upper(), None)
            rsa[i] = 0.0 if not max_asa else float(asa[i] / max_asa)
        return np.clip(rsa, 0.0, 1.0)[:, None]

    @torch.no_grad()
    def _contact_count(self, traj: md.Trajectory, cutoff_nm=0.8):
        """Number of other residues within `cutoff_nm` of each residue, [L, 1].

        Uses CA positions when every residue has a CA; otherwise falls back to
        the mean position of each residue's atoms. Self-contacts are excluded.
        """
        L = traj.n_residues
        ca_atoms = traj.topology.select("name CA")
        if len(ca_atoms) == L:
            coors = traj.xyz[0, ca_atoms, :]  # nm
        else:
            xyz = traj.xyz[0]
            coors = []
            for res in traj.topology.residues:
                idxs = [a.index for a in res.atoms]
                coors.append(xyz[idxs, :].mean(axis=0))
            coors = np.array(coors, dtype=np.float32)
        diff = coors[:, None, :] - coors[None, :, :]
        dist = np.sqrt((diff**2).sum(-1))  # nm
        mask = (dist < cutoff_nm).astype(np.float32)
        np.fill_diagonal(mask, 0.0)  # a residue is not its own contact
        cnt = mask.sum(axis=1)
        return cnt[:, None].astype(np.float32)

    @torch.no_grad()
    def _plddt(self, pdb_file: str):
        """Per-residue pLDDT in [0, 1], read from PDB B-factors, [L, 1]."""
        # Read per-atom B-factors with Biopython (ESMFold/AlphaFold write
        # pLDDT into the B-factor column).
        from Bio.PDB import PDBParser
        import numpy as np

        parser = PDBParser(QUIET=True)
        structure = parser.get_structure("prot", pdb_file)
        model = structure[0]

        res_plddt = []
        for chain in model:
            for residue in chain:
                atoms = list(residue.get_atoms())
                if len(atoms) == 0:
                    res_plddt.append(0.0)
                    continue
                # Mean B-factor over this residue's atoms.
                bvals = [float(atom.get_bfactor()) for atom in atoms]
                res_plddt.append(float(np.mean(bvals)))

        # Normalize from the 0-100 pLDDT scale to [0, 1].
        plddt = np.array(res_plddt, dtype=np.float32) / 100.0
        plddt = np.clip(plddt, 0.0, 1.0)
        return plddt[:, None]  # [L,1]

    @torch.no_grad()
    def _parse_and_features(self, pdb_file: str):
        """Parse a PDB and assemble coordinates + the 17-dim feature matrix."""
        traj = md.load(pdb_file)
        L = traj.n_residues

        angles = self._angles(traj)         # [L,6]
        ss8 = self._ss8(traj)               # [L,8]
        rsa = self._rsa(traj)               # [L,1]
        cnt = self._contact_count(traj)     # [L,1]
        plddt = self._plddt(pdb_file)       # [L,1]

        feats = np.concatenate([angles, ss8, rsa, cnt, plddt], axis=1).astype(np.float32)  # [L,17]

        # Coordinates: CA per residue when available, otherwise atom-mean
        # fallback (same rule as _contact_count).
        ca_atoms = traj.topology.select("name CA")
        if len(ca_atoms) == L:
            coors_nm = traj.xyz[0, ca_atoms, :]
        else:
            xyz = traj.xyz[0]
            res_coords = []
            for res in traj.topology.residues:
                idxs = [a.index for a in res.atoms]
                res_coords.append(xyz[idxs, :].mean(axis=0))
            coors_nm = np.array(res_coords, dtype=np.float32)
        coors_ang = coors_nm * 10.0  # nm -> Å
        return coors_ang.astype(np.float32), feats  # [L,3], [L,17]

    @torch.no_grad()
    def forward(self, pdb_file: str):
        """Return (scalars [N,17], coords [N,3]) tensors on self.device."""
        coors_ang, scalars = self._parse_and_features(pdb_file)
        coors = torch.tensor(coors_ang, dtype=torch.float32, device=self.device)   # [N,3]
        scalars = torch.tensor(scalars, dtype=torch.float32, device=self.device)   # [N,17]

        return scalars, coors  # [N,17], [N,3]
622
+
623
class ResiduePipelineWithHFESM:
    """Fold a protein sequence with ESMFold, then derive residue-level
    structural features from the predicted PDB."""

    def __init__(self,
                 esm_model_name="facebook/esmfold_v1",
                 cache_dir=None,
                 esm_device='cpu',
                 allow_tf32=True
                 ):
        """Build the folding model and the downstream feature extractor."""
        self.esm = ESMFoldPredictorHF(esm_model_name, cache_dir, esm_device, allow_tf32)
        self.struct_encoder = StructureFeatureExtractorNoDSSP(device=esm_device)
        self.cache_dir = cache_dir

    @torch.no_grad()
    def __call__(self, seq: str, save_pdb_path: str = None) -> torch.Tensor:
        """Fold `seq`, write the PDB to `save_pdb_path` (or a scratch file in
        cache_dir/tempdir), and return (features [L,17], coords [L,3])."""
        pdb_str = self.esm.infer_pdb_str(seq)
        if save_pdb_path is None:
            base = tempfile.gettempdir() if self.cache_dir is None else self.cache_dir
            save_pdb_path = str(Path(base) / "esmfold_pred_fold5.pdb")
        Path(save_pdb_path).write_text(pdb_str)

        struct_emb, struct_coords = self.struct_encoder(save_pdb_path)
        return struct_emb, struct_coords
644
+
645
# Uppercase amino-acid alphabet accepted downstream (standard 20 plus
# ambiguity/rare codes X, B, Z, J, U, O).
_VALID_AA = frozenset("ACDEFGHIKLMNPQRSTVWYXBZJUO")

def sanitize_protein_seq(seq: str) -> str:
    """Uppercase `seq`, strip all whitespace, and drop any character outside
    the accepted amino-acid alphabet. Non-strings yield the empty string."""
    if not isinstance(seq, str):
        return ""
    compact = "".join(seq.split()).upper()
    return "".join(ch for ch in compact if ch in _VALID_AA)
651
+
652
@torch.no_grad()
def batch_embed_to_dicts(
    df: pd.DataFrame,
    seq_col: str,
    pipeline,
    show_progress: bool = True,
) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], List[Tuple[str, str]]]:
    """
    Run `pipeline` once per unique, sanitized sequence found in df[seq_col].

    Args:
        df: DataFrame holding raw sequences in column `seq_col`.
        seq_col: name of the sequence column.
        pipeline: callable mapping a sequence to (embedding [L, D], coords [L, 3]).
        show_progress: wrap the loop in a tqdm progress bar when True.

    Returns:
        - emb_dict: {seq -> z(torch.Tensor[L, D])}
        - coord_dict:{seq -> coords(torch.Tensor[L, 3])}
        - failures: [(seq, err_msg), ...]
    """

    raw_list = df[seq_col].astype(str).tolist()
    # Normalize first; sanitization can produce empty strings, which are dropped.
    seqs = [s for s in (sanitize_protein_seq(r) for r in raw_list) if s]
    uniq_seqs = sorted(set(seqs))

    logger.info(f"Total rows: {len(df)}, valid seqs: {len(seqs)}, unique: {len(uniq_seqs)}")

    emb_dict: Dict[str, torch.Tensor] = {}
    coord_dict: Dict[str, torch.Tensor] = {}
    failures: List[Tuple[str, str]] = []

    # Wrap the loop in tqdm exactly once. (The original wrapped `iterator` in
    # tqdm a second time, nesting progress bars and ignoring show_progress=False.)
    iterator = tqdm(uniq_seqs, desc="ESMfold Predicting structure...") if show_progress else uniq_seqs
    for seq in iterator:
        if seq in emb_dict:
            continue
        try:
            z_t, c_t = pipeline(seq)  # z: [L, D], coords: [L, 3] (torch.Tensor)
            emb_dict[seq] = z_t.detach().float().cpu()
            coord_dict[seq] = c_t.detach().float().cpu()
        except Exception as e:
            # Best-effort: record the failure and keep going.
            failures.append((seq, repr(e)))
            continue

    logger.info(f"[DONE] OK: {len(emb_dict)}, Failed: {len(failures)}")
    if failures:
        # logging uses %-style lazy formatting; the original passed the list as
        # a stray positional argument with no placeholder, which raises a
        # formatting error inside the logging machinery.
        logger.error("[SAMPLE failures] %s", failures[:3])
    return emb_dict, coord_dict, failures
696
+
697
class ESMFoldEncoder(nn.Module):
    """Look up (or lazily compute and cache) ESMFold structural features and
    coordinates for every sequence in a DataFrame column.

    Per-chain caches live under <this file's dir>/<cache_dir> as
    '{chain}_feat_dict.pt' / '{chain}_coord_dict.pt'.
    """

    def __init__(self, model_name="facebook/esmfold_v1", esm_cache_dir="esm_cache", cache_dir="cache"):
        """
        Args:
            model_name: HuggingFace id of the ESMFold model.
            esm_cache_dir: HuggingFace download cache passed to the predictor.
            cache_dir: directory (relative to this file) for embedding caches.
        """
        super(ESMFoldEncoder, self).__init__()
        self.model_name = model_name
        self.esm_cache_dir = esm_cache_dir
        self.cache_dir = cache_dir

    def save_obj(self, obj, path):
        """Save object to a file (no compression)."""
        torch.save(obj, path)

    def load_obj(self, path):
        """Load object from a file (no compression)."""
        return torch.load(path, map_location='cpu', weights_only=False)

    def load_esm_dict(self, device, df_data, chain, re_embed):
        """Populate self.dict['<chain>_feat'] / ['<chain>_coord'] for every
        sanitized sequence in df_data[chain], computing and caching any that
        are missing. `device` is where the folding pipeline runs."""

        def _clean_unique(series: pd.Series) -> list:
            # Sanitize, drop empties, de-duplicate, and sort for determinism.
            cleaned = []
            for s in series.astype(str).tolist():
                ss = sanitize_protein_seq(s)
                if ss:
                    cleaned.append(ss)
            return sorted(set(cleaned))

        def _retry_embed_df(
            df: pd.DataFrame,
            chain: str,
            max_retries: int = 2,
            show_progress: bool = True,
        ):
            """
            Try to embed protein sequences with retries on failures.

            Args:
                df (pd.DataFrame): A DataFrame containing a column `chain` with sequences.
                chain (str): The column name containing the sequences (e.g., "alpha", "beta").
                max_retries (int): Maximum number of retries for failed sequences.
                show_progress (bool): Whether to display tqdm progress bars.

            Returns:
                feat_dict (Dict[str, torch.Tensor]): {sequence -> embedding tensor [L, D]}.
                coord_dict (Dict[str, torch.Tensor]): {sequence -> coordinate tensor [L, 3]}.
                failures (List[Tuple[str, str]]): List of (sequence, error_message) that still failed after retries.
            """

            # NOTE(review): a fresh pipeline (and model load) is built per
            # _retry_embed_df call — confirm this is acceptable for your memory
            # budget.
            pipeline = ResiduePipelineWithHFESM(
                esm_model_name=self.model_name,
                cache_dir=self.esm_cache_dir,
                esm_device=device
            )

            # 1. First attempt
            feat_dict, coord_dict, failures = batch_embed_to_dicts(
                df, chain, pipeline, show_progress=show_progress
            )

            # 2. Retry loop for failed sequences
            tries = 0
            while failures and tries < max_retries:
                tries += 1
                retry_seqs = [s for s, _ in failures]
                logger.info(f"[retry {tries}/{max_retries}] {len(retry_seqs)} sequences")
                retry_df = pd.DataFrame({chain: retry_seqs})

                f2, c2, failures = batch_embed_to_dicts(
                    retry_df, chain, pipeline, show_progress=show_progress
                )
                feat_dict.update(f2)
                coord_dict.update(c2)

            return feat_dict, coord_dict, failures

        def update_with_new_seqs(feat_dict, coord_dict, chain):
            """Embed any sequence present in df_data[chain] but missing from
            the loaded dictionaries, then persist the merged caches."""
            base_dir = os.path.dirname(os.path.abspath(__file__))
            base_dir = base_dir + "/" + self.cache_dir
            os.makedirs(base_dir, exist_ok=True)
            path_feat = os.path.join(base_dir, f"{chain}_feat_dict.pt")
            path_coords = os.path.join(base_dir, f"{chain}_coord_dict.pt")

            all_seqs_clean = _clean_unique(df_data[chain])
            new_seqs = [s for s in all_seqs_clean if s not in feat_dict]
            if not new_seqs:
                logger.info(f"No new {chain} sequences found")
                return feat_dict, coord_dict

            logger.info(f"Found new {chain} sequences, embedding...")
            df_new = pd.DataFrame({chain: new_seqs})
            new_feat_dict, new_coord_dict, failures = _retry_embed_df(df_new, chain, max_retries=100)
            feat_dict.update(new_feat_dict)
            coord_dict.update(new_coord_dict)
            self.save_obj(feat_dict, path_feat)
            self.save_obj(coord_dict, path_coords)

            if failures:
                for seq, err in failures:
                    logger.error(f"[create] failed: {seq} | {err}")

            logger.info(f"Updated and saved {path_feat} and {path_coords}")

            return feat_dict, coord_dict

        def get_or_create_dict(chain):
            """Load the cached dictionaries for `chain`, or build them from
            scratch when absent or re_embed is requested."""
            base_dir = os.path.dirname(os.path.abspath(__file__)) + "/" + self.cache_dir
            os.makedirs(base_dir, exist_ok=True)
            path_feat = os.path.join(base_dir, f"{chain}_feat_dict.pt")
            path_coords = os.path.join(base_dir, f"{chain}_coord_dict.pt")

            if os.path.exists(path_feat) and not re_embed:
                logger.info(f"Loading {path_feat} and {path_coords}")
                feat_dict = self.load_obj(path_feat)
                coord_dict = self.load_obj(path_coords)
            else:
                logger.info(f"{path_feat} and {path_coords} not found or re_embed=True, generating...")
                unique_seqs = _clean_unique(df_data[chain])
                df_uniq = pd.DataFrame({chain: unique_seqs})
                feat_dict, coord_dict, failures = _retry_embed_df(
                    df_uniq, chain, show_progress=True, max_retries=100
                )
                self.save_obj(feat_dict, path_feat)
                self.save_obj(coord_dict, path_coords)

                if failures:
                    for seq, err in failures:
                        logger.error(f"[create] failed: {seq} | {err}")

                logger.info(f"Saved {path_feat} and {path_coords}")

            return feat_dict, coord_dict

        # Load (or create) the caches, then top them up with any new sequences.
        self.dict[chain+'_feat'], self.dict[chain+'_coord'] = update_with_new_seqs(*get_or_create_dict(chain), chain)

    def pad_and_stack(self, batch_feats, L_max, batch_coors):
        """
        batch_feats: list of [L_i, D] tensors
        batch_coors: list of [L_i, 3] tensors
        return:
            feats: [B, L_max, D]
            coors: [B, L_max, 3]
            mask : [B, L_max] (True for real tokens)
        """
        assert len(batch_feats) == len(batch_coors)
        B = len(batch_feats)
        D = batch_feats[0].shape[-1]

        feats_pad = []
        coors_pad = []
        masks = []

        for x, c in zip(batch_feats, batch_coors):
            L = x.shape[0]
            pad_L = L_max - L
            # pad feats/coors with zeros along the length dimension
            feats_pad.append(torch.nn.functional.pad(x, (0, 0, 0, pad_L)))   # [L_max, D]
            coors_pad.append(torch.nn.functional.pad(c, (0, 0, 0, pad_L)))   # [L_max, 3]
            m = torch.zeros(L_max, dtype=torch.bool)
            m[:L] = True
            masks.append(m)

        feats = torch.stack(feats_pad, dim=0)   # [B, L_max, D]
        coors = torch.stack(coors_pad, dim=0)   # [B, L_max, 3]
        mask = torch.stack(masks, dim=0)        # [B, L_max]
        return feats, coors, mask

    def forward(self, df_data, chain, device='cpu', re_embed=False):
        """
        df_data: pd.DataFrame with a column `chain` containing sequences
        chain: str, e.g. "alpha" or "beta"
        device: str, e.g. 'cpu' or 'cuda:0'
        re_embed: bool, whether to re-embed even if cached files exist

        Returns per-row lists (unpadded): (batch_feats [L_i, D], batch_coors [L_i, 3]).
        """
        # Fresh lookup state each call; load_esm_dict fills it from cache/compute.
        self.dict = {}
        self.load_esm_dict(device, df_data, chain, re_embed)

        batch_feats = []
        batch_coors = []
        for seq in df_data[chain].astype(str).tolist():
            ss = sanitize_protein_seq(seq)
            if ss in self.dict[chain+'_feat'] and ss in self.dict[chain+'_coord']:
                batch_feats.append(self.dict[chain+'_feat'][ss])
                batch_coors.append(self.dict[chain+'_coord'][ss])
            else:
                # Should only occur if embedding failed for this sequence.
                raise ValueError(f"Sequence not found in embedding dict: {ss}")

        # L_max = max(x.shape[0] for x in batch_feats)

        return batch_feats, batch_coors
885
+
886
+
887
+ # =================================== Dataset / Collate ===========================================
888
class PepHLA_Dataset(torch.utils.data.Dataset):
    """Peptide–HLA pair dataset.

    Each item bundles precomputed per-residue features (physicochemical,
    ESM, structure+coordinates) for one peptide/HLA pair with its label.
    Feature dicts are keyed by the raw sequence strings in the dataframe.
    """

    def __init__(self, df, phys_dict, esm2_dict, struct_dict):
        self.df = df
        self.phys_dict = phys_dict
        self.esm2_dict = esm2_dict
        self.struct_dict = struct_dict

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        pep_seq = record['peptide']
        hla_seq = record['HLA_full']
        # struct_dict values are (feature, coordinate) tensor pairs
        struct_feat, struct_coord = self.struct_dict[hla_seq]

        return {
            'pep_phys': self.phys_dict['pep'][pep_seq],
            'pep_esm': self.esm2_dict['pep'][pep_seq],
            'hla_phys': self.phys_dict['hla'][hla_seq],
            'hla_esm': self.esm2_dict['hla'][hla_seq],
            'hla_struct': struct_feat,
            'hla_coord': struct_coord,
            'label': torch.tensor(record['label'], dtype=torch.float32),
            'pep_id': pep_seq,
            'hla_id': hla_seq,
        }
922
+
923
def peptide_hla_collate_fn(batch):
    """Collate peptide/HLA items into padded batch tensors.

    Peptide-branch tensors ('pep_*', excluding ids) are zero-padded to the
    longest peptide in the batch; id fields become plain lists; everything
    else (label, fixed-length HLA tensors) is stacked directly. A boolean
    'pep_mask' (True = real residue) is added.
    """
    def fit_length(t, valid, target):
        # Keep at most `valid` rows, then right-pad with zeros to `target`.
        kept = t[:min(valid, target)]
        short = target - kept.shape[0]
        if short > 0:
            kept = torch.cat([kept, t.new_zeros(short, t.shape[1])], dim=0)
        return kept

    pep_lens = [len(it['pep_id']) for it in batch]
    longest = max(pep_lens)

    collated = {}
    for key in batch[0]:
        if key.endswith('_id'):
            collated[key] = [it[key] for it in batch]
        elif key.startswith('pep_'):
            collated[key] = torch.stack(
                [fit_length(it[key], len(it['pep_id']), longest) for it in batch]
            )
        else:
            # 'label' and fixed-length HLA tensors
            collated[key] = torch.stack([it[key] for it in batch])

    mask = torch.zeros(len(batch), longest, dtype=torch.bool)
    for row, length in enumerate(pep_lens):
        mask[row, :length] = True
    collated['pep_mask'] = mask
    return collated
960
+
961
+ # =================================== Dataset / Collate ===========================================
962
class TCRPepHLA_Dataset(torch.utils.data.Dataset):
    """
    Dataset for TCRα + TCRβ + peptide + HLA binding.

    Each item gathers cached physicochemical / ESM / structural features for
    all four chains, CDR3 span boundaries for both TCR chains, and the
    pretrained peptide–HLA feature pair keyed by (peptide, HLA).
    """

    def __init__(self, df, phys_dict, esm2_dict, struct_dict, pep_hla_feat_dict):
        self.df = df
        self.phys_dict = phys_dict
        self.esm2_dict = esm2_dict
        self.struct_dict = struct_dict
        self.pep_hla_feat_dict = pep_hla_feat_dict

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        rec = self.df.iloc[idx]
        tcra = rec['tcra']
        tcrb = rec['tcrb']
        pep = rec['peptide']
        hla = rec['HLA_full']

        def chain_feats(name, seq):
            # (phys, esm, struct, coord) for one chain; struct_dict stores pairs.
            struct, coord = self.struct_dict[name][seq]
            return self.phys_dict[name][seq], self.esm2_dict[name][seq], struct, coord

        a_phys, a_esm, a_struct, a_coord = chain_feats('tcra', tcra)
        b_phys, b_esm, b_struct, b_coord = chain_feats('tcrb', tcrb)
        p_phys, p_esm, p_struct, p_coord = chain_feats('pep', pep)
        h_phys, h_esm, h_struct, h_coord = chain_feats('hla', hla)

        pretrained = self.pep_hla_feat_dict[(pep, hla)]

        def as_long(v):
            return torch.tensor(v, dtype=torch.long)

        return {
            # TCRα
            'tcra_phys': a_phys,
            'tcra_esm': a_esm,
            'tcra_struct': a_struct,
            'tcra_coord': a_coord,
            'cdr3a_start': as_long(rec['cdr3a_start']),
            'cdr3a_end': as_long(rec['cdr3a_end']),

            # TCRβ
            'tcrb_phys': b_phys,
            'tcrb_esm': b_esm,
            'tcrb_struct': b_struct,
            'tcrb_coord': b_coord,
            'cdr3b_start': as_long(rec['cdr3b_start']),
            'cdr3b_end': as_long(rec['cdr3b_end']),

            # peptide
            'pep_phys': p_phys,
            'pep_esm': p_esm,
            'pep_struct': p_struct,
            'pep_coord': p_coord,

            # HLA
            'hla_phys': h_phys,
            'hla_esm': h_esm,
            'hla_struct': h_struct,
            'hla_coord': h_coord,

            'tcra_id': tcra,
            'tcrb_id': tcrb,
            'pep_id': pep,
            'hla_id': hla,
            'label': torch.tensor(rec['label'], dtype=torch.float32),

            'pep_feat_pretrain': pretrained['pep_feat_pretrain'],
            'hla_feat_pretrain': pretrained['hla_feat_pretrain'],
        }
1050
+
1051
+ # =================================== Collate Function ===========================================
1052
def tcr_pep_hla_collate_fn(batch):
    """Collate TCRα/TCRβ/peptide/HLA items into padded batch tensors.

    Variable-length branches ('tcra_*', 'tcrb_*', 'pep_*', excluding ids)
    are zero-padded to the batch maximum of their chain; id fields become
    lists; everything else (labels, CDR3 indices, fixed-length HLA tensors,
    pretrained HLA features) is stacked directly. Boolean masks
    (True = real residue) are added per variable-length chain.
    """
    def fit_length(t, valid, target):
        # Keep at most `valid` rows, then right-pad with zeros to `target`.
        kept = t[:min(valid, target)]
        short = target - kept.shape[0]
        if short > 0:
            kept = torch.cat([kept, t.new_zeros(short, t.shape[1])], dim=0)
        return kept

    def bool_mask(lengths, target):
        m = torch.zeros(len(lengths), target, dtype=torch.bool)
        for row, length in enumerate(lengths):
            m[row, :length] = True
        return m

    chain_lens = {
        c: [len(it[f'{c}_id']) for it in batch] for c in ('tcra', 'tcrb', 'pep')
    }
    chain_max = {c: max(ls) for c, ls in chain_lens.items()}

    out = {}
    for key in batch[0]:
        prefix = key.split('_', 1)[0]
        if key.endswith('_id'):
            out[key] = [it[key] for it in batch]
        elif prefix in chain_max:
            target = chain_max[prefix]
            out[key] = torch.stack(
                [fit_length(it[key], len(it[f'{prefix}_id']), target) for it in batch]
            )
        else:
            out[key] = torch.stack([it[key] for it in batch])

    for c in ('tcra', 'tcrb', 'pep'):
        out[f'{c}_mask'] = bool_mask(chain_lens[c], chain_max[c])

    return out
1106
+
1107
+ # ==================================== 小积木:投影 + 门控 =========================================
1108
class ResidueProjector(nn.Module):
    """Align a branch's channel width to a common dimension D.

    Uses nn.Identity when the input already has the target width, otherwise
    a learned per-residue linear map.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        if in_dim == out_dim:
            self.proj = nn.Identity()
        else:
            self.proj = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        # x: [B, L, in_dim] -> [B, L, out_dim]
        return self.proj(x)
1115
+
1116
class ResidueDoubleFusion(nn.Module):
    """
    Residue-level fusion of two aligned modalities (same [B, L, D] shape).

    Pipeline: LayerNorm both branches -> cross-attention (x1 attends to x2)
    -> per-residue sigmoid gate blends x1 with the attended features ->
    linear projection + LayerNorm.

    Typical usage:
        - x1: physicochemical features
        - x2: ESM embeddings (or structure features)
    """

    def __init__(self, dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.dim = dim

        # Cross-attention: lets x1 pull information from x2
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True
        )

        # Per-residue scalar gate in (0, 1)
        self.gate = nn.Sequential(
            nn.Linear(dim * 2, dim),
            nn.ReLU(),
            nn.Linear(dim, 1),
            nn.Sigmoid()
        )

        # Projection applied after the gated blend
        self.out_proj = nn.Linear(dim, dim)

        # Layer norms for stable training
        self.norm_x1 = nn.LayerNorm(dim)
        self.norm_x2 = nn.LayerNorm(dim)
        self.norm_out = nn.LayerNorm(dim)

    def forward(self, x1, x2):
        """
        Args:
            x1: Tensor [B, L, D] - first modality (e.g. physicochemical).
            x2: Tensor [B, L, D] - second modality (e.g. ESM embeddings).
        Returns:
            Tensor [B, L, D] - fused residue-level representation.
        """
        # 1) Normalize both branches, 2) x1 queries x2 per residue
        queries = self.norm_x1(x1)
        keys_vals = self.norm_x2(x2)
        attended, _ = self.cross_attn(
            query=queries,
            key=keys_vals,
            value=keys_vals
        )  # [B, L, D]

        # 3) Gate between the original x1 and the attention-enhanced x2
        blend = self.gate(torch.cat([x1, attended], dim=-1))  # [B, L, 1]
        mixed = blend * x1 + (1 - blend) * attended

        # 4) Projection + normalization
        return self.norm_out(self.out_proj(mixed))
1181
+
1182
class ResidueTripleFusion(nn.Module):
    """
    Hierarchical three-branch residue-level fusion.

    Step 1: merge physicochemical features with language-model embeddings.
    Step 2: merge that intermediate representation with structure features.

    Both stages reuse ResidueDoubleFusion (cross-attention + gating +
    linear projection).
    """

    def __init__(self, dim, num_heads=8, dropout=0.1):
        super().__init__()
        # Stage 1: physicochemical x ESM embeddings
        self.fuse_phys_esm = ResidueDoubleFusion(dim, num_heads=num_heads, dropout=dropout)
        # Stage 2: (phys+esm) x structure embeddings
        self.fuse_f12_struct = ResidueDoubleFusion(dim, num_heads=num_heads, dropout=dropout)

    def forward(self, phys, esm, struct):
        """
        Args:
            phys:   Tensor [B, L, D], physicochemical features (e.g. AAindex-based).
            esm:    Tensor [B, L, D], protein language-model embeddings.
            struct: Tensor [B, L, D], structure-derived features.

        Returns:
            Tensor [B, L, D], final fused representation.
        """
        intermediate = self.fuse_phys_esm(phys, esm)
        return self.fuse_f12_struct(intermediate, struct)
1216
+
1217
class BANLayer(nn.Module):
    """
    Bilinear Attention Network Layer with proper 2D masked-softmax.

    Computes h_out bilinear attention maps between a "v" sequence and a "q"
    sequence, then pools both sequences through each map into a fixed-size
    vector.

    v_mask: [B, L_v] True=valid
    q_mask: [B, L_q] True=valid
    """
    def __init__(self, v_dim, q_dim, h_dim, h_out, act='ReLU', dropout=0.2, k=3):
        super().__init__()
        self.c = 32      # threshold choosing explicit bilinear params vs. a linear head
        self.k = k       # number of "glimpses" per hidden unit, pooled back down later
        self.v_dim = v_dim
        self.q_dim = q_dim
        self.h_dim = h_dim
        self.h_out = h_out

        # Project both inputs into the shared h_dim * k space
        self.v_net = FCNet([v_dim, h_dim * self.k], act=act, dropout=dropout)
        self.q_net = FCNet([q_dim, h_dim * self.k], act=act, dropout=dropout)

        if 1 < k:
            # Average-pool groups of k glimpses back to h_dim (see attention_pooling)
            self.p_net = nn.AvgPool1d(self.k, stride=self.k)

        if h_out <= self.c:
            # Few heads: explicit bilinear parameters (low-rank form)
            self.h_mat = nn.Parameter(torch.Tensor(1, h_out, 1, h_dim * self.k).normal_())
            self.h_bias = nn.Parameter(torch.Tensor(1, h_out, 1, 1).normal_())
        else:
            # Many heads: weight-normalized linear head instead
            self.h_net = weight_norm(nn.Linear(h_dim * self.k, h_out), dim=None)

        self.bn = nn.BatchNorm1d(h_dim)

    def attention_pooling(self, v, q, att_map): # att_map: [B, L_v, L_q]
        # Bilinear pooling: sum over all (v, q) pairs weighted by the attention map
        logits = torch.einsum('bvk,bvq,bqk->bk', (v, att_map, q))
        if 1 < self.k:
            # AvgPool over each k-glimpse group, rescaled by k => grouped sum
            logits = self.p_net(logits.unsqueeze(1)).squeeze(1) * self.k
        return logits

    def _masked_softmax_2d(self, logits, v_mask, q_mask):
        """
        logits: [B, h_out, L_v, L_q]
        v_mask: [B, L_v] or None
        q_mask: [B, L_q] or None
        return: probs [B, h_out, L_v, L_q] (masked entries=0, normalized over
                the valid 2-D submatrix)
        """
        B, H, Lv, Lq = logits.shape
        device = logits.device
        if v_mask is None:
            v_mask = torch.ones(B, Lv, dtype=torch.bool, device=device)
        if q_mask is None:
            q_mask = torch.ones(B, Lq, dtype=torch.bool, device=device)

        # A cell is valid only when both its row and column are valid
        mask2d = (v_mask[:, :, None] & q_mask[:, None, :]) # [B, Lv, Lq]
        mask2d = mask2d[:, None, :, :].expand(B, H, Lv, Lq) # [B, H, Lv, Lq]

        logits = logits.masked_fill(~mask2d, -float('inf'))

        # Softmax over the joint Lv*Lq space
        flat = logits.view(B, H, -1) # [B, H, Lv*Lq]
        # Edge case: a sample may have no valid cells; replace -inf to avoid NaN
        flat = torch.where(torch.isinf(flat), torch.full_like(flat, -1e9), flat)
        flat = F.softmax(flat, dim=-1)
        flat = torch.nan_to_num(flat, nan=0.0) # safety fallback
        probs = flat.view(B, H, Lv, Lq)

        # Zero out masked positions (numerical stability & easier visualization)
        probs = probs * mask2d.float()
        return probs

    def forward(self, v, q, v_mask=None, q_mask=None, softmax=True):
        """
        v: [B, L_v, Dv], q: [B, L_q, Dq]
        Returns (logits [B, h_dim], att_maps [B, h_out, L_v, L_q]).
        """
        B, L_v, _ = v.size()
        _, L_q, _ = q.size()

        v_ = self.v_net(v) # [B, L_v, h_dim*k]
        q_ = self.q_net(q) # [B, L_q, h_dim*k]

        if self.h_out <= self.c:
            att_maps = torch.einsum('xhyk,bvk,bqk->bhvq', (self.h_mat, v_, q_)) + self.h_bias # [B,H,Lv,Lq]
        else:
            v_t = v_.transpose(1, 2).unsqueeze(3) # [B, K, Lv, 1]
            q_t = q_.transpose(1, 2).unsqueeze(2) # [B, K, 1, Lq]
            d_ = torch.matmul(v_t, q_t) # [B, K, Lv, Lq]
            att_maps = self.h_net(d_.permute(0, 2, 3, 1)) # [B, Lv, Lq, H]
            att_maps = att_maps.permute(0, 3, 1, 2) # [B, H, Lv, Lq]

        if softmax:
            att_maps = self._masked_softmax_2d(att_maps, v_mask, q_mask)
        else:
            # Even without softmax, zero invalid cells to avoid leaking padding
            if v_mask is not None:
                att_maps = att_maps.masked_fill(~v_mask[:, None, :, None], 0.0)
            if q_mask is not None:
                att_maps = att_maps.masked_fill(~q_mask[:, None, None, :], 0.0)

        # Note: v_ / q_ are still [B, L, K], aligned with att_maps' [B,H,Lv,Lq];
        # head results are summed, not concatenated
        logits = self.attention_pooling(v_, q_, att_maps[:, 0, :, :])
        for i in range(1, self.h_out):
            logits = logits + self.attention_pooling(v_, q_, att_maps[:, i, :, :])

        logits = self.bn(logits)
        return logits, att_maps
1318
+
1319
class FCNet(nn.Module):
    """
    Multi-layer perceptron with weight-normalized linear layers.

    For dims = [d0, d1, ..., dn] each stage is (Dropout?) -> weight_norm
    Linear -> activation; the dropout is skipped when dropout == 0 and the
    activation when act == ''.
    """

    def __init__(self, dims, act='ReLU', dropout=0.2):
        super(FCNet, self).__init__()

        modules = []

        def add_stage(fan_in, fan_out):
            if dropout > 0:
                modules.append(nn.Dropout(dropout))
            modules.append(weight_norm(nn.Linear(fan_in, fan_out), dim=None))
            if act != '':
                modules.append(getattr(nn, act)())

        # Hidden stages over consecutive dim pairs, then the final stage
        for fan_in, fan_out in zip(dims[:-2], dims[1:-1]):
            add_stage(fan_in, fan_out)
        add_stage(dims[-2], dims[-1])

        self.main = nn.Sequential(*modules)

    def forward(self, x):
        return self.main(x)
1342
+
1343
class StackedEGNN(nn.Module):
    """A sequential stack of EGNN layers.

    With update_coors=False (the default) the coordinates pass through each
    layer unchanged while node features are refined.
    """

    def __init__(self, dim, layers, update_coors=False, **egnn_kwargs):
        super().__init__()
        self.layers = nn.ModuleList(
            EGNN(dim=dim, update_coors=update_coors, **egnn_kwargs)
            for _ in range(layers)
        )

    def forward(self, feats, coors, mask=None):
        """feats: [B, L_max, D], coors: [B, L_max, 3], mask: [B, L_max] bool or None."""
        for egnn_layer in self.layers:
            feats, coors = egnn_layer(feats, coors, mask=mask)
        return feats, coors
1356
+
1357
class FocalLoss(nn.Module):
    """
    Binary focal loss on logits (Lin et al., 2017).

    loss = alpha_t * (1 - p_t)^gamma * BCE, where p_t is the model's
    probability of the true class and alpha_t is alpha for positives,
    (1 - alpha) for negatives.
    """

    def __init__(self, alpha=0.5, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """
        Args:
            inputs: raw logits, same shape as targets.
            targets: binary labels in {0, 1}, float tensor.
        Returns:
            Scalar loss for 'mean'/'sum' reduction, else per-element tensor.
        """
        per_elem_bce = F.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        # p_t = exp(-BCE) is the predicted probability of the true class
        prob_true = torch.exp(-per_elem_bce)

        class_weight = targets * self.alpha + (1 - targets) * (1 - self.alpha)
        focal = class_weight * (1 - prob_true) ** self.gamma * per_elem_bce

        if self.reduction == 'mean':
            return focal.mean()
        if self.reduction == 'sum':
            return focal.sum()
        return focal
1377
+
1378
+ # ===================================== 主模型(完全版) ===========================================
1379
class PeptideHLABindingPredictor(nn.Module):
    """
    Peptide–HLA binding classifier.

    Per-residue physicochemical, ESM, and (for HLA) EGNN-refined structural
    features are projected to a shared width, fused with gated
    cross-attention, encoded by Transformers, exchanged via bidirectional
    cross-attention, and finally scored with a bilinear attention head over
    the peptide and the HLA pocket positions.
    """
    def __init__(
        self,
        phys_dim=20,          # output width of the physicochemical encoder
        pep_dim=256,          # unified peptide channel width
        hla_dim=256,          # unified HLA channel width
        bilinear_dim=256,
        pseudo_seq_pos=None,  # pocket positions (assumed 0-based, within [0, 179])
        device="cuda:0",
        loss_fn='bce',        # 'bce' or 'focal'
        alpha=0.5,            # focal-loss alpha (unused for 'bce')
        gamma=2.0,            # focal-loss gamma (unused for 'bce')
        dropout=0.2,
        pos_weights=None      # optional positive-class weight for BCE
    ):
        super().__init__()
        self.device = device
        self.pep_dim = pep_dim
        self.hla_dim = hla_dim
        self.bilinear_dim = bilinear_dim
        self.alpha = alpha
        self.gamma = gamma
        self.dropout = dropout
        if loss_fn == 'bce':
            self.loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weights]) if pos_weights is not None else None)
        elif loss_fn == 'focal':
            self.loss_fn = FocalLoss(alpha=alpha, gamma=gamma)
        else:
            raise ValueError(f"Unknown loss function: {loss_fn}")

        # EGNN stack refining the 17-dim HLA structural features
        self.se3_model = StackedEGNN(
            dim=17, layers=3
        )

        self.max_pep_len = 20
        self.max_hla_len = 180

        # Learned absolute positional embeddings
        self.pep_pos_embed = nn.Parameter(torch.randn(self.max_pep_len, pep_dim))
        self.hla_pos_embed = nn.Parameter(torch.randn(self.max_hla_len, hla_dim))

        # —— Per-residue projections of each branch to the unified width ——
        # peptide branch (physicochemical -> pep_dim, ESM2 (1280) -> pep_dim)
        self.proj_pep_phys = ResidueProjector(in_dim=phys_dim, out_dim=pep_dim)
        self.proj_pep_esm = ResidueProjector(in_dim=1280, out_dim=pep_dim)

        # HLA branch (physicochemical -> hla_dim, ESM2 (1280) -> hla_dim, struct (17) -> hla_dim)
        self.proj_hla_phys = ResidueProjector(in_dim=phys_dim, out_dim=hla_dim)
        self.proj_hla_esm = ResidueProjector(in_dim=1280, out_dim=hla_dim)
        self.proj_hla_se3 = ResidueProjector(in_dim=17, out_dim=hla_dim)  # se3 output stays 17-dim

        # —— Per-residue gated fusion ——
        self.gate_pep = ResidueDoubleFusion(pep_dim)   # pep_phys × pep_esm
        self.gate_hla = ResidueTripleFusion(hla_dim)   # hla_phys × hla_esm × hla_struct

        d_model = self.pep_dim
        n_heads = 8

        # 1. "Peptide queries HLA" (pep as Q, HLA as K/V)
        self.cross_attn_pep_hla = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=n_heads,
            dropout=self.dropout,
            batch_first=True
        )
        self.norm_cross_pep = nn.LayerNorm(d_model)

        # 2. "HLA queries Peptide" (HLA as Q, pep as K/V)
        self.cross_attn_hla_pep = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=n_heads,
            dropout=self.dropout,
            batch_first=True
        )
        self.norm_cross_hla = nn.LayerNorm(d_model)

        # —— Interaction module (bilinear attention map) ——
        self.bi_attn = BANLayer(v_dim=pep_dim, q_dim=hla_dim, h_dim=bilinear_dim, h_out=4, k=3)

        # —— Classification head ——
        self.head = nn.Sequential(
            nn.Linear(bilinear_dim, bilinear_dim),
            nn.ReLU(),
            nn.Linear(bilinear_dim, 1)
        )

        # —— Pocket positions (1-based literature positions shifted by -2) ——
        if pseudo_seq_pos is None:
            pseudo_seq_pos = [i-2 for i in [7, 9, 24, 45, 59, 62, 63, 66, 67, 69, 70, 73, 74, 76, 77, 80, 81, 84, 95, 97, 99, 114, 116, 118, 143, 147, 150, 152, 156, 158, 159, 163, 167, 171]]
        self.register_buffer("contact_idx", torch.tensor(pseudo_seq_pos, dtype=torch.long))

        # --------------------------------------------
        # Transformer Encoders for peptide & HLA
        # --------------------------------------------
        encoder_layer_pep = TransformerEncoderLayer(
            d_model=pep_dim,
            nhead=8,
            dim_feedforward=pep_dim*4,
            dropout=self.dropout,
            batch_first=True   # inputs shaped [B, L, D]
        )
        self.pep_encoder = TransformerEncoder(encoder_layer_pep, num_layers=2)

        encoder_layer_hla = TransformerEncoderLayer(
            d_model=hla_dim,
            nhead=8,
            dim_feedforward=hla_dim*4,
            dropout=self.dropout,
            batch_first=True
        )
        self.hla_encoder = TransformerEncoder(encoder_layer_hla, num_layers=1)

    # ---------------- utility: pad a list of [L, D] into [B, L_max, D] ----------------
    def _pad_stack(self, tensors, L_max=None):
        """Zero-pad variable-length [L, D] tensors and stack; returns (batch, bool mask)."""
        Ls = [t.shape[0] for t in tensors]
        if L_max is None: L_max = max(Ls)
        D = tensors[0].shape[-1]
        B = len(tensors)
        out = tensors[0].new_zeros((B, L_max, D))
        mask = torch.zeros(B, L_max, dtype=torch.bool, device=out.device)
        for i, t in enumerate(tensors):
            L = t.shape[0]
            out[i, :L] = t
            mask[i, :L] = True
        return out, mask  # [B,L_max,D], [B,L_max]

    # ----------------------------------- pocket selection --------------------------------------

    def _mask_to_pockets(self, hla_feat):
        """
        Keep only the pocket positions of the HLA features.

        Returns:
            hla_pocket: [B, n_pocket, D]
        """
        B, L, D = hla_feat.shape

        # ensure idx in [0, L-1]
        idx = self.contact_idx.clamp(min=0, max=L-1)
        # gather pocket features
        hla_pocket = hla_feat[:, idx, :]  # [B, n_pocket, D]

        return hla_pocket

    def add_positional_encoding(self, x, pos_embed):
        """
        Add learned absolute positional embeddings.

        x: [B, L, D]
        pos_embed: [L_max, D]
        """
        B, L, D = x.shape
        # take the first L positional encodings
        pe = pos_embed[:L, :].unsqueeze(0).expand(B, -1, -1)  # [B, L, D]
        return x + pe

    def forward(self, batch):
        """
        Run a full training/eval step on one collated batch.

        Returns:
            probs: np.ndarray [B], sigmoid probabilities (detached).
            binding_loss: scalar loss tensor (grad-capable).
            attn: np.ndarray, bilinear attention maps summed over heads.
            fused_vec: np.ndarray, pooled interaction vectors (detached).
        """
        # take batch from DataLoader
        pep_phys = batch['pep_phys'].to(self.device, non_blocking=True)
        pep_esm = batch['pep_esm'].to(self.device, non_blocking=True)
        hla_phys = batch['hla_phys'].to(self.device, non_blocking=True)
        hla_esm = batch['hla_esm'].to(self.device, non_blocking=True)
        hla_struct = batch['hla_struct'].to(self.device, non_blocking=True)
        hla_coord = batch['hla_coord'].to(self.device, non_blocking=True)
        labels = batch['label'].to(self.device)

        # 1) peptide: physicochemical + ESM2 -> gated fusion
        pep_phys = self.proj_pep_phys(pep_phys)
        pep_esm = self.proj_pep_esm(pep_esm)
        pep_feat = self.gate_pep(pep_phys, pep_esm)  # [B, Lp, D]

        pep_feat = self.add_positional_encoding(pep_feat, self.pep_pos_embed)
        pep_feat = self.pep_encoder(pep_feat, src_key_padding_mask=~batch['pep_mask'].to(self.device, non_blocking=True))

        # 2) HLA: physicochemical + ESM2 + structure -> SE3 -> gated fusion
        hla_phys = self.proj_hla_phys(hla_phys)
        hla_esm = self.proj_hla_esm(hla_esm)
        # hla_struct is [B, 180, 17]; run it through the EGNN stack first
        hla_se3 = self.se3_model(hla_struct, hla_coord, None)[0]  # [B, 180, 17]
        hla_se3 = self.proj_hla_se3(hla_se3)  # -> 256
        hla_feat = self.gate_hla(hla_phys, hla_esm, hla_se3)

        hla_feat = self.add_positional_encoding(hla_feat, self.hla_pos_embed)
        hla_feat = self.hla_encoder(hla_feat)  # no mask: HLA length is fixed

        # cross attention for pep
        pep_feat_cross, _ = self.cross_attn_pep_hla(
            query=pep_feat,
            key=hla_feat,
            value=hla_feat,
            key_padding_mask=None
        )

        # cross attention for hla
        hla_feat_cross, _ = self.cross_attn_hla_pep(
            query=hla_feat,
            key=pep_feat,
            value=pep_feat,
            key_padding_mask=~batch['pep_mask'].to(self.device, non_blocking=True)
        )

        # residual + LayerNorm on both directions
        pep_feat_updated = self.norm_cross_pep(pep_feat + pep_feat_cross)
        hla_feat_updated = self.norm_cross_hla(hla_feat + hla_feat_cross)

        # 3) select HLA pocket positions only
        hla_pocket = self._mask_to_pockets(hla_feat_updated)

        # 4) bilinear attention between peptide and pocket
        fused_vec, attn = self.bi_attn(
            pep_feat_updated,
            hla_pocket,
            v_mask=batch['pep_mask'].to(self.device, non_blocking=True),
            q_mask=None
        )
        logits = self.head(fused_vec).squeeze(-1)

        probs = torch.sigmoid(logits).detach().cpu().numpy()

        binding_loss = self.loss_fn(logits, labels.float())

        return probs, binding_loss, attn.detach().cpu().numpy().sum(axis=1), fused_vec.detach().cpu().numpy()

    # ---------------- encoder reuse interface (for the TCR-HLA model) ----------------
    def _pad_peptide(self, x, max_len):
        """Pad peptide feature tensor [1, L, D] to [1, max_len, D]."""
        B, L, D = x.shape
        if L < max_len:
            pad = x.new_zeros(B, max_len - L, D)
            return torch.cat([x, pad], dim=1)
        else:
            return x[:, :max_len, :]

    @torch.no_grad()
    def encode_peptide_hla(self, pep_id, pep_phys, pep_esm, hla_phys, hla_esm, hla_struct, hla_coord, max_pep_len):
        """
        Encode a single peptide/HLA pair (batch size 1) without gradients.

        pep_id: the peptide sequence string (only its length is used).
        Returns (pep_feat_updated, hla_feat_updated), both post cross-attention.
        """
        Lp = len(pep_id)

        pep_phys = self.proj_pep_phys(pep_phys)
        pep_esm = self.proj_pep_esm(pep_esm)

        pep_phys = self._pad_peptide(pep_phys, max_pep_len)
        pep_esm = self._pad_peptide(pep_esm, max_pep_len)

        device = pep_phys.device
        pep_mask = torch.zeros(1, max_pep_len, dtype=torch.bool, device=device)
        pep_mask[0, :Lp] = True

        pep_feat = self.gate_pep(pep_phys, pep_esm)
        pep_feat = self.add_positional_encoding(pep_feat, self.pep_pos_embed)
        pep_feat = self.pep_encoder(pep_feat, src_key_padding_mask=~pep_mask)

        # 2) hla encoding
        hla_phys = self.proj_hla_phys(hla_phys)
        hla_esm = self.proj_hla_esm(hla_esm)
        hla_se3 = self.se3_model(hla_struct, hla_coord, None)[0]
        hla_se3 = self.proj_hla_se3(hla_se3)
        hla_feat = self.gate_hla(hla_phys, hla_esm, hla_se3)
        hla_feat = self.add_positional_encoding(hla_feat, self.hla_pos_embed)
        hla_feat = self.hla_encoder(hla_feat)

        # --- 3a. Peptide (Q) queries HLA (K, V) ---
        pep_feat_cross, _ = self.cross_attn_pep_hla(
            query=pep_feat,
            key=hla_feat,
            value=hla_feat,
            key_padding_mask=None
        )
        pep_feat_updated = self.norm_cross_pep(pep_feat + pep_feat_cross)

        # --- 3b. HLA (Q) queries Peptide (K, V) ---
        hla_feat_cross, _ = self.cross_attn_hla_pep(
            query=hla_feat,
            key=pep_feat,
            value=pep_feat,
            key_padding_mask=~pep_mask
        )
        hla_feat_updated = self.norm_cross_hla(hla_feat + hla_feat_cross)

        return pep_feat_updated, hla_feat_updated
1653
+
1654
+ class TCRPeptideHLABindingPredictor(nn.Module):
1655
+ def __init__(
1656
+ self,
1657
+ tcr_dim=256,
1658
+ pep_dim=256,
1659
+ hla_dim=256,
1660
+ bilinear_dim=256,
1661
+ loss_fn='bce',
1662
+ alpha=0.5,
1663
+ gamma=2.0,
1664
+ dropout=0.1,
1665
+ device='cuda:0',
1666
+ pos_weights=None
1667
+ ):
1668
+ super().__init__()
1669
+
1670
+ # TCR α / β position embeddings
1671
+ self.max_tcra_len = 500
1672
+ self.max_tcrb_len = 500
1673
+ self.max_pep_len = 20
1674
+ self.max_hla_len = 180
1675
+ self.alpha = alpha
1676
+ self.gamma = gamma
1677
+ self.dropout = dropout
1678
+
1679
+ if loss_fn == 'bce':
1680
+ self.loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weights]) if pos_weights is not None else None)
1681
+ elif loss_fn == 'focal':
1682
+ self.loss_fn = FocalLoss(alpha=alpha, gamma=gamma)
1683
+ else:
1684
+ raise ValueError(f"Unknown loss function: {loss_fn}")
1685
+
1686
+ self.tcra_pos_embed = nn.Parameter(torch.randn(self.max_tcra_len, tcr_dim))
1687
+ self.tcrb_pos_embed = nn.Parameter(torch.randn(self.max_tcrb_len, tcr_dim))
1688
+ self.pep_pos_embed = nn.Parameter(torch.randn(self.max_pep_len, pep_dim))
1689
+ self.hla_pos_embed = nn.Parameter(torch.randn(self.max_hla_len, hla_dim))
1690
+
1691
+ self.device = device
1692
+ self.tcr_dim = tcr_dim
1693
+ self.pep_dim = pep_dim
1694
+ self.hla_dim = hla_dim
1695
+ self.bilinear_dim = bilinear_dim
1696
+
1697
+ d_model = tcr_dim
1698
+ n_heads = 8
1699
+
1700
+ self.cross_attn_tcra_pep = nn.MultiheadAttention(d_model, n_heads, dropout=self.dropout, batch_first=True)
1701
+ self.cross_attn_tcra_hla = nn.MultiheadAttention(d_model, n_heads, dropout=self.dropout, batch_first=True)
1702
+ self.cross_attn_tcrb_pep = nn.MultiheadAttention(d_model, n_heads, dropout=self.dropout, batch_first=True)
1703
+ self.cross_attn_tcrb_hla = nn.MultiheadAttention(d_model, n_heads, dropout=self.dropout, batch_first=True)
1704
+ self.norm_tcra_pep = nn.LayerNorm(d_model)
1705
+ self.norm_tcra_hla = nn.LayerNorm(d_model)
1706
+ self.norm_tcrb_pep = nn.LayerNorm(d_model)
1707
+ self.norm_tcrb_hla = nn.LayerNorm(d_model)
1708
+
1709
+ # =======================
1710
+ # TCRα / TCRβ encoders
1711
+ # =======================
1712
+ def make_tcr_encoder():
1713
+ proj_phys = ResidueProjector(20, tcr_dim)
1714
+ proj_esm = ResidueProjector(1280, tcr_dim)
1715
+ proj_struct = ResidueProjector(17, tcr_dim)
1716
+ se3 = StackedEGNN(dim=17, layers=1)
1717
+ gate = ResidueTripleFusion(tcr_dim)
1718
+ encoder_layer = TransformerEncoderLayer(
1719
+ d_model=tcr_dim, nhead=8, dim_feedforward=tcr_dim*4, dropout=self.dropout, batch_first=True
1720
+ )
1721
+ encoder = TransformerEncoder(encoder_layer, num_layers=2)
1722
+ return nn.ModuleDict(dict(
1723
+ proj_phys=proj_phys, proj_esm=proj_esm, proj_struct=proj_struct,
1724
+ se3=se3, gate=gate, encoder=encoder
1725
+ ))
1726
+
1727
+ self.tcra_enc = make_tcr_encoder()
1728
+ self.tcrb_enc = make_tcr_encoder()
1729
+
1730
+ # =======================
1731
+ # Peptide encoder (phys + esm + structure)
1732
+ # =======================
1733
+ self.proj_pep_phys = ResidueProjector(20, pep_dim)
1734
+ self.proj_pep_esm = ResidueProjector(1280, pep_dim)
1735
+ self.proj_pep_struct = ResidueProjector(17, pep_dim)
1736
+ self.pep_se3 = StackedEGNN(dim=17, layers=1)
1737
+ self.pep_gate = ResidueTripleFusion(pep_dim)
1738
+ pep_encoder_layer = TransformerEncoderLayer(
1739
+ d_model=pep_dim, nhead=8, dim_feedforward=pep_dim*4, dropout=self.dropout, batch_first=True
1740
+ )
1741
+ self.pep_encoder = TransformerEncoder(pep_encoder_layer, num_layers=2)
1742
+
1743
+ # =======================
1744
+ # HLA encoder
1745
+ # =======================
1746
+ self.proj_hla_phys = ResidueProjector(20, hla_dim)
1747
+ self.proj_hla_esm = ResidueProjector(1280, hla_dim)
1748
+ self.proj_hla_struct = ResidueProjector(17, hla_dim)
1749
+ self.hla_se3 = StackedEGNN(dim=17, layers=1)
1750
+ self.hla_gate = ResidueTripleFusion(hla_dim)
1751
+ hla_encoder_layer = TransformerEncoderLayer(
1752
+ d_model=hla_dim, nhead=8, dim_feedforward=hla_dim*4, dropout=self.dropout, batch_first=True
1753
+ )
1754
+ self.hla_encoder = TransformerEncoder(hla_encoder_layer, num_layers=1)
1755
+
1756
+ self.pep_gate_2 = ResidueDoubleFusion(pep_dim)
1757
+ self.hla_gate_2 = ResidueDoubleFusion(hla_dim)
1758
+
1759
+ # =======================
1760
+ # Bilinear interactions
1761
+ # =======================
1762
+ self.bi_tcra_pep = BANLayer(tcr_dim, pep_dim, bilinear_dim, h_out=4, k=3)
1763
+ self.bi_tcrb_pep = BANLayer(tcr_dim, pep_dim, bilinear_dim, h_out=4, k=3)
1764
+ self.bi_tcra_hla = BANLayer(tcr_dim, hla_dim, bilinear_dim, h_out=4, k=3)
1765
+ self.bi_tcrb_hla = BANLayer(tcr_dim, hla_dim, bilinear_dim, h_out=4, k=3)
1766
+
1767
+ # =======================
1768
+ # Head
1769
+ # =======================
1770
+ total_fused_dim = bilinear_dim * 4
1771
+ self.head = nn.Sequential(
1772
+ nn.Linear(total_fused_dim, bilinear_dim),
1773
+ nn.ReLU(),
1774
+ nn.Linear(bilinear_dim, 1)
1775
+ )
1776
+
1777
+ def encode_tcr(self, x_phys, x_esm, x_struct, x_coord, x_mask, enc, pos_embed):
1778
+ phys = enc['proj_phys'](x_phys)
1779
+ esm = enc['proj_esm'](x_esm)
1780
+ se3 = enc['se3'](x_struct, x_coord, None)[0]
1781
+ se3 = enc['proj_struct'](se3)
1782
+ feat = enc['gate'](phys, esm, se3)
1783
+ feat = self.add_positional_encoding(feat, pos_embed)
1784
+ feat = enc['encoder'](feat, src_key_padding_mask=~x_mask)
1785
+ return feat
1786
+
1787
+ def add_positional_encoding(self, x, pos_embed):
1788
+ """
1789
+ x: [B, L, D]
1790
+ pos_embed: [L_max, D]
1791
+ """
1792
+ B, L, D = x.shape
1793
+ pe = pos_embed[:L, :].unsqueeze(0).expand(B, -1, -1)
1794
+ return x + pe
1795
+
1796
+ # def _extract_cdr3_segment(self, tcr_feat, cdr3_start, cdr3_end):
1797
+ # B, L, D = tcr_feat.shape
1798
+ # device = tcr_feat.device
1799
+
1800
+ # max_len = (cdr3_end - cdr3_start + 1).max().item()
1801
+
1802
+ # # [max_len], 0..max_len-1
1803
+ # rel_idx = torch.arange(max_len, device=device).unsqueeze(0).expand(B, -1) # [B, max_len]
1804
+ # # absolute index = start + rel_idx
1805
+ # abs_idx = cdr3_start.unsqueeze(1) + rel_idx
1806
+ # # clamp end
1807
+ # abs_idx = abs_idx.clamp(0, L-1)
1808
+
1809
+ # # mask positions beyond end
1810
+ # mask = rel_idx <= (cdr3_end - cdr3_start).unsqueeze(1)
1811
+
1812
+ # # gather
1813
+ # # expand abs_idx to [B, max_len, D] for gather
1814
+ # gather_idx = abs_idx.unsqueeze(-1).expand(-1, -1, D)
1815
+ # out = torch.gather(tcr_feat, 1, gather_idx) # [B, max_len, D]
1816
+
1817
+ # return out, mask
1818
+
1819
+ def _extract_cdr3_segment(self, tcr_feat, cdr3_start, cdr3_end):
1820
+ """
1821
+ Extracts CDR3 embeddings and corresponding mask.
1822
+ tcr_feat: [B, L, D]
1823
+ cdr3_start, cdr3_end: [B]
1824
+ Returns:
1825
+ out: [B, max_len, D]
1826
+ mask: [B, max_len] (True = valid)
1827
+ """
1828
+ B, L, D = tcr_feat.shape
1829
+ device = tcr_feat.device
1830
+
1831
+ # 每个样本的 cdr3 长度
1832
+ lens = (cdr3_end - cdr3_start).clamp(min=0)
1833
+ max_len = lens.max().item()
1834
+
1835
+ rel_idx = torch.arange(max_len, device=device).unsqueeze(0).expand(B, -1) # [B, max_len]
1836
+ abs_idx = cdr3_start.unsqueeze(1) + rel_idx # [B, max_len]
1837
+
1838
+ # mask: True 表示有效
1839
+ mask = rel_idx < lens.unsqueeze(1) # 注意这里 "<" 就够了
1840
+
1841
+ # 将超出范围的索引设为 0(任意有效索引都行,因为会被mask掉)
1842
+ abs_idx = torch.where(mask, abs_idx, torch.zeros_like(abs_idx))
1843
+
1844
+ # gather
1845
+ gather_idx = abs_idx.unsqueeze(-1).expand(-1, -1, D)
1846
+ out = torch.gather(tcr_feat, 1, gather_idx)
1847
+
1848
+ # 对 mask 为 False 的位置强制置零,避免无效 token 参与计算
1849
+ out = out * mask.unsqueeze(-1)
1850
+
1851
+ return out, mask
1852
+
1853
    def forward(self, batch):
        """Full forward pass: encode TCRα/β, peptide and HLA; cross-attend;
        fuse with bilinear attention; score binding and compute the loss.

        Args:
            batch: dict of tensors. Required keys (shapes per the encoders):
                'tcra_phys/esm/struct/coord/mask', 'tcrb_...' (same set),
                'pep_phys/esm/struct/coord/mask', 'hla_phys/esm/struct/coord',
                'cdr3a_start/end', 'cdr3b_start/end', 'label'.
                Optional: 'pep_feat_pretrain' and 'hla_feat_pretrain'
                (both must be present for the pretrain gating branch to run).

        Returns:
            probs: [B] sigmoid binding probabilities.
            loss_binding: scalar loss from self.loss_fn on the logits.
            pep_feat: peptide features as a detached CPU numpy array.
            attn_dict: dict of four BAN attention maps (numpy, summed over dim 1).
        """
        # TCRα / TCRβ: shared encoder routine, chain-specific modules/pos-embeds.
        tcra_feat = self.encode_tcr(
            batch['tcra_phys'].to(self.device, non_blocking=True),
            batch['tcra_esm'].to(self.device, non_blocking=True),
            batch['tcra_struct'].to(self.device, non_blocking=True),
            batch['tcra_coord'].to(self.device, non_blocking=True),
            batch['tcra_mask'].to(self.device, non_blocking=True),
            self.tcra_enc,
            self.tcra_pos_embed
        )
        tcrb_feat = self.encode_tcr(
            batch['tcrb_phys'].to(self.device, non_blocking=True),
            batch['tcrb_esm'].to(self.device, non_blocking=True),
            batch['tcrb_struct'].to(self.device, non_blocking=True),
            batch['tcrb_coord'].to(self.device, non_blocking=True),
            batch['tcrb_mask'].to(self.device, non_blocking=True),
            self.tcrb_enc,
            self.tcrb_pos_embed
        )
        # Peptide: fuse physicochemical + ESM + EGNN-refined structural features.
        pep_phys = self.proj_pep_phys(batch['pep_phys'].to(self.device, non_blocking=True))
        pep_esm = self.proj_pep_esm(batch['pep_esm'].to(self.device, non_blocking=True))
        pep_se3 = self.pep_se3(batch['pep_struct'].to(self.device, non_blocking=True), batch['pep_coord'].to(self.device, non_blocking=True), None)[0]
        pep_se3 = self.proj_pep_struct(pep_se3)
        pep_feat = self.pep_gate(pep_phys, pep_esm, pep_se3)
        pep_feat = self.add_positional_encoding(pep_feat, self.pep_pos_embed)
        pep_feat = self.pep_encoder(
            pep_feat,
            src_key_padding_mask=~batch['pep_mask'].to(self.device)
        )
        # HLA: same triple fusion. NOTE(review): no padding mask is passed to
        # the HLA encoder — presumably HLA inputs are fixed length; confirm.
        hla_phys = self.proj_hla_phys(batch['hla_phys'].to(self.device, non_blocking=True))
        hla_esm = self.proj_hla_esm(batch['hla_esm'].to(self.device, non_blocking=True))
        hla_se3 = self.hla_se3(batch['hla_struct'].to(self.device, non_blocking=True), batch['hla_coord'].to(self.device, non_blocking=True), None)[0]
        hla_se3 = self.proj_hla_struct(hla_se3)
        hla_feat = self.hla_gate(hla_phys, hla_esm, hla_se3)
        hla_feat = self.add_positional_encoding(hla_feat, self.hla_pos_embed)
        hla_feat = self.hla_encoder(hla_feat)

        # Optionally gate in pretrained peptide/HLA representations.
        if ('pep_feat_pretrain' in batch) and ('hla_feat_pretrain' in batch):
            pep_pretrain = batch['pep_feat_pretrain'].to(self.device, non_blocking=True)
            hla_pretrain = batch['hla_feat_pretrain'].to(self.device, non_blocking=True)

            # ---- Robust length alignment (truncate both to the shorter length) ----
            Lp = pep_feat.shape[1]
            Lp_pretrain = pep_pretrain.shape[1]
            if Lp != Lp_pretrain:
                Lp_min = min(Lp, Lp_pretrain)
                pep_feat = pep_feat[:, :Lp_min, :]
                pep_pretrain = pep_pretrain[:, :Lp_min, :]

            Lh = hla_feat.shape[1]
            Lh_pretrain = hla_pretrain.shape[1]
            if Lh != Lh_pretrain:
                Lh_min = min(Lh, Lh_pretrain)
                hla_feat = hla_feat[:, :Lh_min, :]
                hla_pretrain = hla_pretrain[:, :Lh_min, :]

            # ---- Peptide gating ----
            pep_feat = self.pep_gate_2(pep_feat, pep_pretrain)
            # ---- HLA gating ----
            hla_feat = self.hla_gate_2(hla_feat, hla_pretrain)

        # TCRα CDR3 segment
        tcra_cdr3, cdr3a_mask = self._extract_cdr3_segment(
            tcra_feat,
            batch['cdr3a_start'].to(self.device, non_blocking=True),
            batch['cdr3a_end'].to(self.device, non_blocking=True)
        )

        # TCRβ CDR3 segment
        tcrb_cdr3, cdr3b_mask = self._extract_cdr3_segment(
            tcrb_feat,
            batch['cdr3b_start'].to(self.device, non_blocking=True),
            batch['cdr3b_end'].to(self.device, non_blocking=True)
        )

        # TCRα CDR3 ← Peptide (residual + LayerNorm around cross-attention)
        tcra_cdr3_cross, _ = self.cross_attn_tcra_pep(
            query=tcra_cdr3,                # [B, La_cdr3, D]
            key=pep_feat, value=pep_feat,   # [B, Lp, D]
            key_padding_mask=~batch['pep_mask'].to(self.device)
        )
        tcra_cdr3 = self.norm_tcra_pep(tcra_cdr3 + tcra_cdr3_cross)
        # Re-mask padded CDR3 positions so invalid tokens cannot leak downstream.
        tcra_cdr3 = tcra_cdr3 * cdr3a_mask.unsqueeze(-1)

        # TCRβ CDR3 ← Peptide
        tcrb_cdr3_cross, _ = self.cross_attn_tcrb_pep(
            query=tcrb_cdr3,
            key=pep_feat, value=pep_feat,
            key_padding_mask=~batch['pep_mask'].to(self.device)
        )
        tcrb_cdr3 = self.norm_tcrb_pep(tcrb_cdr3 + tcrb_cdr3_cross)
        tcrb_cdr3 = tcrb_cdr3 * cdr3b_mask.unsqueeze(-1)

        # ---------- Cross-attention: full TCR sequence ← HLA ----------
        # TCRα full ← HLA
        tcra_hla_cross, _ = self.cross_attn_tcra_hla(
            query=tcra_feat,                # [B, La, D]
            key=hla_feat, value=hla_feat,   # [B, Lh, D]
            key_padding_mask=None
        )
        tcra_feat = self.norm_tcra_hla(tcra_feat + tcra_hla_cross)
        tcra_feat = tcra_feat * batch['tcra_mask'].to(self.device).unsqueeze(-1)

        # TCRβ full ← HLA
        tcrb_hla_cross, _ = self.cross_attn_tcrb_hla(
            query=tcrb_feat,
            key=hla_feat, value=hla_feat,
            key_padding_mask=None
        )
        tcrb_feat = self.norm_tcrb_hla(tcrb_feat + tcrb_hla_cross)
        tcrb_feat = tcrb_feat * batch['tcrb_mask'].to(self.device).unsqueeze(-1)

        # Bilinear (BAN) fusion of each TCR view with peptide / HLA.
        vec_tcra_pep, attn_tcra_pep = self.bi_tcra_pep(tcra_cdr3, pep_feat, v_mask=cdr3a_mask, q_mask=batch['pep_mask'].to(self.device))
        vec_tcrb_pep, attn_tcrb_pep = self.bi_tcrb_pep(tcrb_cdr3, pep_feat, v_mask=cdr3b_mask, q_mask=batch['pep_mask'].to(self.device))
        vec_tcra_hla, attn_tcra_hla = self.bi_tcra_hla(tcra_feat, hla_feat, v_mask=batch['tcra_mask'].to(self.device), q_mask=None)
        vec_tcrb_hla, attn_tcrb_hla = self.bi_tcrb_hla(tcrb_feat, hla_feat, v_mask=batch['tcrb_mask'].to(self.device), q_mask=None)

        # Collapse dim 1 of each attention map for compact reporting
        # (presumably the BAN h_out head dimension — confirm against BANLayer).
        attn_tcra_pep_small = attn_tcra_pep.sum(dim=1).float()
        attn_tcrb_pep_small = attn_tcrb_pep.sum(dim=1).float()
        attn_tcra_hla_small = attn_tcra_hla.sum(dim=1).float()
        attn_tcrb_hla_small = attn_tcrb_hla.sum(dim=1).float()

        attn_dict = {
            'attn_tcra_pep': attn_tcra_pep_small.detach().cpu().numpy(),
            'attn_tcrb_pep': attn_tcrb_pep_small.detach().cpu().numpy(),
            'attn_tcra_hla': attn_tcra_hla_small.detach().cpu().numpy(),
            'attn_tcrb_hla': attn_tcrb_hla_small.detach().cpu().numpy()
        }

        fused = torch.cat([vec_tcra_pep, vec_tcrb_pep, vec_tcra_hla, vec_tcrb_hla], dim=-1)
        logits = self.head(fused).squeeze(-1)

        labels = batch['label'].to(self.device)
        loss_binding = self.loss_fn(logits, labels.float())

        probs = torch.sigmoid(logits)

        return probs, loss_binding, pep_feat.detach().cpu().numpy(), attn_dict
src/phla_cache/hla_coord_dict.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad3f796d0193cb85fa7786581064df26bf50bc75a362fd4687b711af5d65738a
3
+ size 381645
src/phla_cache/hla_feat_dict.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08abc4aff2c1afcf26829a14f9e44081f70b70e6aad52f69be39b0d055e6fd87
3
+ size 1878067
src/physicochemical.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import List, Optional
4
+ import numpy as np
5
+ from sklearn.preprocessing import StandardScaler
6
+
7
+ # class PhysicochemicalEncoder(nn.Module):
8
+ # """Amino Acid Physicochemical Property Encoder (AAindex版本)"""
9
+
10
+ # def __init__(self, device, use_aaindex=True, selected_features=None):
11
+ # """
12
+ # Args:
13
+ # device: torch device
14
+ # use_aaindex: 是否使用AAindex特征(True)还是简单的5特征(False)
15
+ # selected_features: 选择使用哪些AAindex特征(None=使用全部)
16
+ # """
17
+ # super().__init__()
18
+ # self.device = device
19
+ # self.use_aaindex = use_aaindex
20
+
21
+ # if use_aaindex:
22
+ # # 从AAindex加载特征
23
+ # self.aa_properties, self.feature_names = self._load_aaindex_features(selected_features)
24
+ # self.n_features = len(list(self.aa_properties['A'].values()))
25
+ # print(f"✓ Loaded {self.n_features} AAindex features")
26
+ # else:
27
+ # # 使用简单的5特征
28
+ # self.aa_properties = self._get_basic_properties()
29
+ # self.n_features = 5
30
+ # print(f"✓ Using {self.n_features} basic features")
31
+
32
+ # # 标准化(重要!不同特征范围差异大)
33
+ # self.scaler = self._fit_scaler()
34
+
35
+ # def _load_aaindex_features(self, selected_features=None):
36
+ # """从AAindex加载特征"""
37
+ # try:
38
+ # # 尝试导入生成的文件
39
+ # from aa_properties_aaindex import AA_PROPERTIES_AAINDEX, FEATURE_DESCRIPTIONS
40
+
41
+ # if selected_features is not None:
42
+ # # 只选择指定的特征
43
+ # filtered_props = {}
44
+ # for aa, props in AA_PROPERTIES_AAINDEX.items():
45
+ # filtered_props[aa] = {k: v for k, v in props.items()
46
+ # if k in selected_features}
47
+ # return filtered_props, selected_features
48
+ # else:
49
+ # # 使用所有特征
50
+ # feature_names = list(AA_PROPERTIES_AAINDEX['A'].keys())
51
+ # return AA_PROPERTIES_AAINDEX, feature_names
52
+
53
+ # except ImportError:
54
+ # print("⚠ Warning: aa_properties_aaindex.py not found!")
55
+ # print(" Falling back to basic 5 features")
56
+ # print(" Run 'python aaindex_downloader.py' to download AAindex features")
57
+ # return self._get_basic_properties(), ['hydro', 'charge', 'volume', 'flex', 'aroma']
58
+
59
+ # def _get_basic_properties(self):
60
+ # """基础的5特征(作为fallback)"""
61
+ # return {
62
+ # 'A': [1.8, 0.0, 88.6, 0.36, 0.0],
63
+ # 'C': [2.5, 0.0, 108.5, 0.35, 0.0],
64
+ # 'D': [-3.5, -1.0, 111.1, 0.51, 0.0],
65
+ # 'E': [-3.5, -1.0, 138.4, 0.50, 0.0],
66
+ # 'F': [2.8, 0.0, 189.9, 0.31, 1.0],
67
+ # 'G': [-0.4, 0.0, 60.1, 0.54, 0.0],
68
+ # 'H': [-3.2, 0.5, 153.2, 0.32, 0.5],
69
+ # 'I': [4.5, 0.0, 166.7, 0.46, 0.0],
70
+ # 'K': [-3.9, 1.0, 168.6, 0.47, 0.0],
71
+ # 'L': [3.8, 0.0, 166.7, 0.37, 0.0],
72
+ # 'M': [1.9, 0.0, 162.9, 0.30, 0.0],
73
+ # 'N': [-3.5, 0.0, 114.1, 0.46, 0.0],
74
+ # 'P': [-1.6, 0.0, 112.7, 0.51, 0.0],
75
+ # 'Q': [-3.5, 0.0, 143.8, 0.49, 0.0],
76
+ # 'R': [-4.5, 1.0, 173.4, 0.53, 0.0],
77
+ # 'S': [-0.8, 0.0, 89.0, 0.51, 0.0],
78
+ # 'T': [-0.7, 0.0, 116.1, 0.44, 0.0],
79
+ # 'V': [4.2, 0.0, 140.0, 0.39, 0.0],
80
+ # 'W': [-0.9, 0.0, 227.8, 0.31, 1.0],
81
+ # 'Y': [-1.3, 0.0, 193.6, 0.42, 1.0],
82
+ # 'X': [0.0, 0.0, 120.0, 0.40, 0.0],
83
+ # }
84
+
85
+ # def _fit_scaler(self):
86
+ # """拟合标准化器"""
87
+ # # 收集所有氨基酸的特征
88
+ # all_features = []
89
+ # for aa in 'ARNDCQEGHILKMFPSTWYV': # 20种标准氨基酸
90
+ # if isinstance(self.aa_properties[aa], dict):
91
+ # # AAindex格式
92
+ # features = list(self.aa_properties[aa].values())
93
+ # else:
94
+ # # 列表格式
95
+ # features = self.aa_properties[aa]
96
+ # all_features.append(features)
97
+
98
+ # all_features = np.array(all_features)
99
+
100
+ # # Z-score标准化
101
+ # scaler = StandardScaler()
102
+ # scaler.fit(all_features)
103
+
104
+ # return scaler
105
+
106
+ # def _get_aa_features(self, aa: str) -> List[float]:
107
+ # """获取单个氨基酸的特征"""
108
+ # aa = aa.upper()
109
+ # if aa not in self.aa_properties:
110
+ # aa = 'X' # Unknown
111
+
112
+ # if isinstance(self.aa_properties[aa], dict):
113
+ # # AAindex格式:字典
114
+ # features = list(self.aa_properties[aa].values())
115
+ # else:
116
+ # # 基础格式:列表
117
+ # features = self.aa_properties[aa]
118
+
119
+ # return features
120
+
121
+ # def forward(self, sequences: List[str]) -> torch.Tensor:
122
+ # """
123
+ # Args:
124
+ # sequences: List of amino acid sequences
125
+ # Returns:
126
+ # [B, max_len, n_features] 标准化后的特征
127
+ # """
128
+ # batch_size = len(sequences)
129
+ # max_len = max(len(seq) for seq in sequences)
130
+
131
+ # # 收集特征
132
+ # properties = []
133
+ # for seq in sequences:
134
+ # seq_props = []
135
+ # for aa in seq:
136
+ # props = self._get_aa_features(aa)
137
+ # seq_props.append(props)
138
+
139
+ # # Padding
140
+ # while len(seq_props) < max_len:
141
+ # seq_props.append([0.0] * self.n_features)
142
+
143
+ # properties.append(seq_props)
144
+
145
+ # properties = np.array(properties) # [B, L, n_features]
146
+
147
+ # # 标准化(除了padding位置)
148
+ # batch_size, seq_len, n_feat = properties.shape
149
+ # properties_flat = properties.reshape(-1, n_feat)
150
+
151
+ # # 标准化
152
+ # properties_normalized = self.scaler.transform(properties_flat)
153
+ # properties_normalized = properties_normalized.reshape(batch_size, seq_len, n_feat)
154
+
155
+ # # 转为tensor
156
+ # properties_tensor = torch.tensor(
157
+ # properties_normalized,
158
+ # dtype=torch.float32,
159
+ # device=self.device
160
+ # )
161
+
162
+ # return properties_tensor # [B, L, n_features]
163
+
164
+ import torch
165
+ import torch.nn as nn
166
+ import numpy as np
167
+ from sklearn.preprocessing import StandardScaler
168
+ from typing import List
169
+
170
class PhysicochemicalEncoder(nn.Module):
    """Amino-acid physicochemical property encoder (AAindex version, vectorized).

    A per-residue feature table is built once in ``__init__``; batches of
    sequences are then encoded with a single tensor lookup plus z-score
    standardization, instead of per-residue Python loops.
    """

    def __init__(self, device, use_aaindex=True, selected_features=None):
        """
        Args:
            device: torch device for the lookup table and returned tensors.
            use_aaindex: if True, try to load AAindex features (falls back to
                the basic 5 features when aa_properties_aaindex is missing);
                if False, always use the basic 5 features.
            selected_features: optional list of AAindex feature names to keep.
        """
        super().__init__()
        self.device = device
        self.use_aaindex = use_aaindex

        # Load the per-amino-acid property table.
        if use_aaindex:
            self.aa_properties, self.feature_names = self._load_aaindex_features(selected_features)
            # FIX: the ImportError fallback stores plain lists, on which
            # dict.values() would raise AttributeError — _get_aa_features
            # handles both layouts.
            self.n_features = len(self._get_aa_features('A'))
            print(f"✓ Loaded {self.n_features} AAindex features")
        else:
            self.aa_properties = self._get_basic_properties()
            self.n_features = 5
            print(f"✓ Using {self.n_features} basic features")

        # Z-score statistics fitted on the 20 standard amino acids.
        self.scaler = self._fit_scaler()

        # ======================== Precomputed lookups ======================== #
        # 1. residue -> row index (sorted for a stable, deterministic order)
        aa_list = sorted(self.aa_properties.keys())
        self.aa_to_idx = {aa: i for i, aa in enumerate(aa_list)}
        self.pad_idx = len(self.aa_to_idx)  # extra all-zero row for padding
        # FIX: unknown residues fall back to 'X' (mirroring _get_aa_features),
        # not to the padding row; when the table has no 'X', use the pad row.
        self.unk_idx = self.aa_to_idx.get('X', self.pad_idx)

        rows = [self._get_aa_features(aa) for aa in aa_list]
        rows.append([0.0] * self.n_features)  # padding vector
        self.aa_feature_table = torch.tensor(
            np.array(rows),
            dtype=torch.float32
        ).to(self.device)  # [n_aa+1, n_feat]

        # 2. standardization parameters as tensors on the target device
        self.mean_tensor = torch.tensor(self.scaler.mean_, dtype=torch.float32, device=self.device)
        self.scale_tensor = torch.tensor(self.scaler.scale_, dtype=torch.float32, device=self.device)

    def _load_aaindex_features(self, selected_features=None):
        """Load AAindex features; fall back to the basic 5 features on ImportError.

        Returns:
            (properties, feature_names) where properties maps residue letter to
            either a feature dict (AAindex) or a feature list (fallback).
        """
        try:
            # NOTE: FEATURE_DESCRIPTIONS is imported deliberately — if the
            # generated module lacks it, we fall through to the basic features.
            from aa_properties_aaindex import AA_PROPERTIES_AAINDEX, FEATURE_DESCRIPTIONS
            if selected_features is not None:
                # Keep only the requested feature columns.
                filtered_props = {}
                for aa, props in AA_PROPERTIES_AAINDEX.items():
                    filtered_props[aa] = {k: v for k, v in props.items() if k in selected_features}
                return filtered_props, selected_features
            else:
                feature_names = list(AA_PROPERTIES_AAINDEX['A'].keys())
                return AA_PROPERTIES_AAINDEX, feature_names
        except ImportError:
            print("⚠ Warning: aa_properties_aaindex.py not found!")
            return self._get_basic_properties(), ['hydro', 'charge', 'volume', 'flex', 'aroma']

    def _get_basic_properties(self):
        """Basic per-residue features: hydropathy, charge, volume, flexibility, aromaticity.

        'X' is the unknown-residue fallback row.
        """
        return {
            'A': [1.8, 0.0, 88.6, 0.36, 0.0],
            'C': [2.5, 0.0, 108.5, 0.35, 0.0],
            'D': [-3.5, -1.0, 111.1, 0.51, 0.0],
            'E': [-3.5, -1.0, 138.4, 0.50, 0.0],
            'F': [2.8, 0.0, 189.9, 0.31, 1.0],
            'G': [-0.4, 0.0, 60.1, 0.54, 0.0],
            'H': [-3.2, 0.5, 153.2, 0.32, 0.5],
            'I': [4.5, 0.0, 166.7, 0.46, 0.0],
            'K': [-3.9, 1.0, 168.6, 0.47, 0.0],
            'L': [3.8, 0.0, 166.7, 0.37, 0.0],
            'M': [1.9, 0.0, 162.9, 0.30, 0.0],
            'N': [-3.5, 0.0, 114.1, 0.46, 0.0],
            'P': [-1.6, 0.0, 112.7, 0.51, 0.0],
            'Q': [-3.5, 0.0, 143.8, 0.49, 0.0],
            'R': [-4.5, 1.0, 173.4, 0.53, 0.0],
            'S': [-0.8, 0.0, 89.0, 0.51, 0.0],
            'T': [-0.7, 0.0, 116.1, 0.44, 0.0],
            'V': [4.2, 0.0, 140.0, 0.39, 0.0],
            'W': [-0.9, 0.0, 227.8, 0.31, 1.0],
            'Y': [-1.3, 0.0, 193.6, 0.42, 1.0],
            'X': [0.0, 0.0, 120.0, 0.40, 0.0],
        }

    def _fit_scaler(self):
        """Fit per-feature z-score statistics on the 20 standard amino acids.

        Returns an object exposing ``mean_`` and ``scale_`` matching what
        sklearn.preprocessing.StandardScaler would compute (population std,
        ddof=0; zero-variance features clamped to 1) — without requiring
        sklearn at runtime.
        """
        from types import SimpleNamespace

        all_features = np.array(
            [self._get_aa_features(aa) for aa in 'ARNDCQEGHILKMFPSTWYV'],
            dtype=np.float64
        )
        mean = all_features.mean(axis=0)
        scale = all_features.std(axis=0)  # ddof=0, same as StandardScaler
        scale[scale == 0.0] = 1.0         # guard against division by zero
        return SimpleNamespace(mean_=mean, scale_=scale)

    def _get_aa_features(self, aa: str):
        """Return the raw feature list for one residue ('X' fallback for unknowns)."""
        aa = aa.upper()
        if aa not in self.aa_properties:
            aa = 'X'
        if isinstance(self.aa_properties[aa], dict):
            return list(self.aa_properties[aa].values())
        else:
            return self.aa_properties[aa]

    def forward(self, sequences: List[str]) -> torch.Tensor:
        """Encode a batch of sequences into standardized property features.

        Args:
            sequences: list of amino-acid strings (variable lengths allowed).

        Returns:
            [B, max_len, n_features] float tensor on self.device. NOTE:
            padded positions hold the standardized zero vector (-mean/scale,
            not exactly 0), preserving the original encoder's behavior.
        """
        if not sequences:
            # Empty batch: keep the output rank consistent.
            return torch.empty(0, 0, self.n_features, device=self.device)

        batch_size = len(sequences)
        max_len = max(len(seq) for seq in sequences)

        # 1) map residues to table indices, padding with pad_idx
        idx_batch = np.full((batch_size, max_len), self.pad_idx, dtype=np.int64)
        for i, seq in enumerate(sequences):
            # FIX: unknown residues use the 'X' row, not the padding row.
            idx_seq = [self.aa_to_idx.get(aa.upper(), self.unk_idx) for aa in seq]
            idx_batch[i, :len(idx_seq)] = idx_seq

        idx_tensor = torch.tensor(idx_batch, dtype=torch.long, device=self.device)  # [B, L]

        # 2) table lookup + z-score standardization
        props = self.aa_feature_table[idx_tensor]  # [B, L, n_feat]
        props = (props - self.mean_tensor) / self.scale_tensor

        return props
src/predictor.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from main import StriMap_pHLA, StriMap_TCRpHLA, load_test_data
3
+
4
def load_model(model_path="model.pt", device=None):
    """Build a StriMap_pHLA predictor and restore its trained weights.

    Args:
        model_path: path to the serialized checkpoint file.
        device: torch device (string or object) forwarded to the model.

    Returns:
        (model, device): the loaded predictor and the device it targets.
    """
    predictor = StriMap_pHLA(device=device, model_save_path=model_path, cache_save=False)
    predictor.load_model(model_path)
    return predictor, device
12
+
13
def predict_from_df(df, model):
    """Run binding prediction on a dataframe of Peptide/HLA pairs.

    Args:
        df: input dataframe with at least 'Peptide' and 'HLA' columns.
        model: a loaded StriMap_pHLA predictor.

    Returns:
        The preprocessed dataframe with a 'Prediction' column appended
        (and no 'label' column).
    """
    # Attach HLA sequence information required by the model.
    data = load_test_data(df_test=df, hla_dict_path='HLA_dict.npy')
    model.prepare_embeddings(data, force_recompute=False)

    # The predict API expects a label column; supply a dummy for inference.
    data['label'] = 1
    torch.cuda.empty_cache()

    scores, _ = model.predict(data, batch_size=128, return_probs=True, use_kfold=False)
    data["Prediction"] = scores

    # Drop the dummy label before returning results to the caller.
    return data.drop(columns=['label'])
src/streamlit_app.py CHANGED
@@ -1,40 +1,59 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd
from io import StringIO
from predictor import load_model, predict_from_df
from Bio import SeqIO
import torch

st.set_page_config(page_title="🧬 Peptide–HLA Binding Predictor", layout="wide")

st.title("🧠 Peptide–HLA Binding Predictor")
st.markdown("""
Upload a **CSV** file with columns `Peptide` and `HLA`,
or a **FASTA** file containing peptide sequences (headers optionally include HLA type).
""")

# Also accept the common ".fa" FASTA extension.
uploaded_file = st.file_uploader("Upload CSV or FASTA", type=["csv", "fasta", "fa"])

# Cache the model across Streamlit reruns so each interaction does not reload weights.
@st.cache_resource
def get_model():
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model, device = load_model("model.pt", device=device)
    return model, device

model, device = get_model()

if uploaded_file:
    if uploaded_file.name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        # FIX: Streamlit uploads are binary file-like objects, but SeqIO.parse
        # requires a text handle — decode the bytes first.
        fasta_handle = StringIO(uploaded_file.getvalue().decode("utf-8"))
        seqs = []
        for rec in SeqIO.parse(fasta_handle, "fasta"):
            header = rec.id
            seq = str(rec.seq)
            # Try to extract the HLA allele from the header, e.g. ">HLA-A*02:01|SLLMWITQC"
            if "|" in header:
                hla, _ = header.split("|", 1)
            else:
                hla = "HLA-Unknown"
            seqs.append([seq, hla])
        df = pd.DataFrame(seqs, columns=["Peptide", "HLA"])

    st.write("✅ Uploaded data preview:")
    st.dataframe(df.head())

    if st.button("🚀 Run Prediction"):
        with st.spinner("Running model inference..."):
            result_df = predict_from_df(df, model)

        st.success("✅ Prediction complete!")
        st.dataframe(result_df.head(10))

        csv = result_df.to_csv(index=False).encode("utf-8")
        st.download_button(
            "⬇️ Download results as CSV",
            data=csv,
            file_name="hla_binding_predictions.csv",
            mime="text/csv",
        )
src/streamlit_app0.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

# Demo controls: point density and winding count of the spiral.
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

# Parametric spiral: radius grows linearly with progress, angle with turns.
progress = np.linspace(0, 1, num_points)
angle = 2 * np.pi * num_turns * progress

spiral = pd.DataFrame({
    "x": progress * np.cos(angle),
    "y": progress * np.sin(angle),
    "idx": progress,
    "rand": np.random.randn(num_points),
})

# Color encodes position along the spiral; size is random jitter for texture.
chart = (
    alt.Chart(spiral, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    )
)
st.altair_chart(chart)