marisming commited on
Commit
d6e5490
·
verified ·
1 Parent(s): b6896e3

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -45,3 +45,4 @@ train_data/ja_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
45
  train_data/ko_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
46
  train_data/protein_4g.txt filter=lfs diff=lfs merge=lfs -text
47
  train_data/zh_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
 
 
45
  train_data/ko_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
46
  train_data/protein_4g.txt filter=lfs diff=lfs merge=lfs -text
47
  train_data/zh_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
48
+ get_data/central_dogma.csv filter=lfs diff=lfs merge=lfs -text
get_data/central_dogma.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdf03ee452e252ac282c561121a06cff89c80544f34c03a77a8558b46c14e228
3
+ size 42416200
get_data/convert_lucaone_data.ipynb ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "f3ad096d-492a-4da2-a390-03c7e7453821",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stderr",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "/home/maris/miniconda3/envs/dnagpt/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
14
+ " from .autonotebook import tqdm as notebook_tqdm\n",
15
+ "Generating train split: 25600 examples [00:00, 82207.06 examples/s]\n"
16
+ ]
17
+ },
18
+ {
19
+ "data": {
20
+ "text/plain": [
21
+ "DatasetDict({\n",
22
+ " train: Dataset({\n",
23
+ " features: ['seq_id_a', 'seq_id_b', 'seq_type_a', 'seq_type_b', 'seq_a', 'seq_b', 'label'],\n",
24
+ " num_rows: 25600\n",
25
+ " })\n",
26
+ "})"
27
+ ]
28
+ },
29
+ "execution_count": 1,
30
+ "metadata": {},
31
+ "output_type": "execute_result"
32
+ }
33
+ ],
34
+ "source": [
35
+ "from datasets import load_dataset\n",
36
+ "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
37
+ "from transformers import Trainer\n",
38
+ "import evaluate\n",
39
+ "import numpy as np\n",
40
+ "from transformers import TrainingArguments\n",
41
+ "from transformers import AutoModelForSequenceClassification\n",
42
+ "\n",
43
+ "raw_datasets = load_dataset('csv', data_files='central_dogma.csv')\n",
44
+ "raw_datasets"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 3,
50
+ "id": "8d89ca1d-1968-43d3-8d47-f9f87b02cd02",
51
+ "metadata": {},
52
+ "outputs": [
53
+ {
54
+ "data": {
55
+ "text/plain": [
56
+ "'LKIELASSYGFCFGVKRAIKIAENAGDAATIGPLIHNNEEINRLATNFNVKTLHGINELKDEKKAIIRTHGITKSDLAELKKTDIKVIDATCPFVTKPQQICEDMSNAGYDVVIFGDENHPEVKGVKSYASGKVYVVLDESELEGVKFRQKVALVSQTTRKVEKFMQIANYLMLRVKEVRVFNTICNATFENQEAVKNLAKRADVMIVIGGKNSSNTKQLYLISKNFCEDSYLIESEHEVEKSWFEGKNLCGISAGASTPDWIIQKVVDAIEKF*'"
57
+ ]
58
+ },
59
+ "execution_count": 3,
60
+ "metadata": {},
61
+ "output_type": "execute_result"
62
+ }
63
+ ],
64
+ "source": [
65
+ "from Bio.Seq import Seq\n",
66
+ "\n",
67
+ "def translate_dna_to_protein_biopython(dna_sequence):\n",
68
+ " \"\"\"Translate a DNA sequence into its corresponding protein sequence using Biopython.\"\"\"\n",
69
+ " # 确保输入的是大写的DNA序列\n",
70
+ " dna_seq = Seq(dna_sequence.upper())\n",
71
+ " \n",
72
+ " # 使用Biopython内置方法进行翻译\n",
73
+ " protein_seq = dna_seq.translate(to_stop=False) # 如果需要在终止密码子处停止,请设置to_stop=True\n",
74
+ " \n",
75
+ " return str(protein_seq)\n",
76
+ "\n",
77
+ "def trim_sequence(sequence, front, length):\n",
78
+ " \"\"\"Trim the specified number of characters from the start and end of a string.\"\"\"\n",
79
+ " # if len(sequence) <= front + back:\n",
80
+ " # raise ValueError(\"The sequence is too short to trim the specified number of characters.\")\n",
81
+ " # return sequence[front:-back] if back > 0 else sequence[front:]\n",
82
+ " return sequence[front:front+length]\n",
83
+ "\n",
84
+ "translate_dna_to_protein_biopython(\"TTGAAGATTGAGCTTGCTAGCAGCTACGGCTTTTGCTTTGGGGTAAAGCGCGCCATAAAGATAGCCGAAAATGCGGGCGATGCCGCTACTATCGGGCCTCTCATACATAATAACGAAGAGATAAACCGCCTGGCTACGAATTTCAATGTCAAGACCCTCCACGGCATAAATGAGCTAAAGGACGAGAAAAAGGCCATCATACGCACTCACGGTATCACAAAAAGCGATCTGGCCGAGCTTAAAAAGACCGATATCAAAGTCATAGACGCCACTTGCCCGTTCGTGACCAAGCCGCAGCAAATTTGCGAGGATATGAGCAACGCAGGATACGATGTCGTGATATTTGGCGATGAAAATCATCCCGAAGTCAAAGGAGTGAAGTCCTATGCCAGCGGAAAGGTTTATGTCGTGCTCGATGAGAGCGAGCTTGAGGGAGTGAAATTTAGACAAAAGGTAGCACTCGTCAGTCAAACGACGCGCAAAGTCGAAAAATTTATGCAAATAGCGAACTACTTGATGCTACGCGTCAAAGAGGTGCGAGTTTTCAACACTATCTGCAACGCGACCTTCGAGAATCAGGAGGCGGTCAAAAATTTAGCCAAAAGAGCCGATGTGATGATAGTCATCGGTGGTAAAAATAGCTCTAATACAAAGCAGCTTTATCTGATATCTAAAAATTTCTGCGAGGACAGCTACCTGATAGAGAGCGAACACGAAGTCGAGAAAAGCTGGTTTGAAGGCAAGAATTTATGCGGTATAAGTGCGGGAGCGAGCACGCCTGATTGGATCATACAAAAAGTCGTCGACGCGATAGAGAAATTTTAA\")"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 11,
90
+ "id": "682a14ef-7492-434e-bed0-af0f5c03db86",
91
+ "metadata": {},
92
+ "outputs": [
93
+ {
94
+ "name": "stderr",
95
+ "output_type": "stream",
96
+ "text": [
97
+ "/home/maris/miniconda3/envs/dnagpt/lib/python3.11/site-packages/Bio/Seq.py:2879: BiopythonWarning: Partial codon, len(sequence) not a multiple of three. Explicitly trim the sequence or add trailing N before translation. This may become an error in future.\n",
98
+ " warnings.warn(\n"
99
+ ]
100
+ }
101
+ ],
102
+ "source": [
103
+ "#获得完全匹配的正例\n",
104
+ "not_match_list = []\n",
105
+ "pos_select_list = [] #正例\n",
106
+ "neg_select_list = [] #负例\n",
107
+ "\n",
108
+ "for item in raw_datasets[\"train\"]:\n",
109
+ " example_dna = item[\"seq_a\"]\n",
110
+ " example_protein = item[\"seq_b\"]\n",
111
+ " label = item[\"label\"]\n",
112
+ "\n",
113
+ " if 0==label: #负例都要\n",
114
+ " neg_select_list.append(item)\n",
115
+ " \n",
116
+ " dna_length = len(example_protein)*3\n",
117
+ " trimmed_sequence = trim_sequence(example_dna,100,dna_length)\n",
118
+ " protein_trans = translate_dna_to_protein_biopython(trimmed_sequence)\n",
119
+ "\n",
120
+ " if 1==label:\n",
121
+ " if protein_trans[1:-2]!=example_protein[1:-2]: #运行有前后1个字符不一样的\n",
122
+ " not_match_list.append(item)\n",
123
+ " else:\n",
124
+ " pos_select_list.append(item)"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 12,
130
+ "id": "9abb1e66-5d3a-4d87-9bf3-577cec8efcf9",
131
+ "metadata": {},
132
+ "outputs": [
133
+ {
134
+ "name": "stdout",
135
+ "output_type": "stream",
136
+ "text": [
137
+ "最长的蛋白质序列: MARRPLVMGNWKLNGSKAFTKELITGLKDELNAVSGCDVAIAPPVMYLAEAETALVSSDIALGTQNVDLNKQGAFTGDISTEMLKDFGVKYVIIGHSERRQYHHESDEFIAKKFGVLKDAGLVPVLCIGESEAENEAGKTEEVCARQIDAVMNTLGVEAFNGAVIAYEPIWAIGTGKSATPAQAQAVHAFIRGHIAKQSQAVAERVIIQYGGSVNDANAAELFTQPDIDGALVGGASLKASAFAVIVKAAAKAKN\n"
138
+ ]
139
+ }
140
+ ],
141
+ "source": [
142
+ "from Bio.Seq import Seq\n",
143
+ "\n",
144
+ "def find_longest_orf(dna_sequence_str):\n",
145
+ " dna_sequence = Seq(dna_sequence_str)\n",
146
+ " # 定义可能的终止密码子\n",
147
+ " stop_codons = ['TAA', 'TAG', 'TGA']\n",
148
+ " \n",
149
+ " # 初始化最长ORF和其长度\n",
150
+ " longest_orf = ''\n",
151
+ " longest_length = 0\n",
152
+ " \n",
153
+ " # 遍历正向链的三个阅读框\n",
154
+ " for frame in range(3):\n",
155
+ " seq = dna_sequence[frame:]\n",
156
+ " start = None\n",
157
+ " for i in range(0, len(seq) - 2, 3):\n",
158
+ " codon = seq[i:i+3]\n",
159
+ " if codon == 'ATG' and start is None:\n",
160
+ " start = i\n",
161
+ " if codon in stop_codons and start is not None:\n",
162
+ " orf = seq[start:i+3]\n",
163
+ " if len(orf) > longest_length:\n",
164
+ " longest_orf = orf\n",
165
+ " longest_length = len(orf)\n",
166
+ " start = None\n",
167
+ " \n",
168
+ " # 遍历反向互补链的三个阅读框\n",
169
+ " rev_seq = dna_sequence.reverse_complement()\n",
170
+ " for frame in range(3):\n",
171
+ " seq = rev_seq[frame:]\n",
172
+ " start = None\n",
173
+ " for i in range(0, len(seq) - 2, 3):\n",
174
+ " codon = seq[i:i+3]\n",
175
+ " if codon == 'ATG' and start is None:\n",
176
+ " start = i\n",
177
+ " if codon in stop_codons and start is not None:\n",
178
+ " orf = seq[start:i+3]\n",
179
+ " if len(orf) > longest_length:\n",
180
+ " longest_orf = orf\n",
181
+ " longest_length = len(orf)\n",
182
+ " start = None\n",
183
+ " \n",
184
+ " # 翻译最长ORF,去除终止符号\n",
185
+ " if longest_orf:\n",
186
+ " protein_sequence = longest_orf.translate(to_stop=True)\n",
187
+ " return str(protein_sequence)\n",
188
+ " else:\n",
189
+ " return \"-\"\n",
190
+ "\n",
191
+ "# 示例DNA序列\n",
192
+ "dna_sequence = \"TCAGTTTTTTGCTTTCGCCGCCGCTTTAACAATGACAGCGAACGCTGACGCTTTTAACGATGCACCACCAACTAATGCACCATCAATATCAGGCTGAGTAAACAATTCTGCTGCATTTGCATCATTGACGGAACCGCCATATTGAATAATTACCCGTTCAGCAACGGCTTGGCTTTGTTTTGCAATATGACCTCGAATAAAGGCATGTACTGCTTGTGCTTGAGCTGGAGTCGCCGATTTACCTGTACCGATAGCCCAAATCGGTTCATAAGCGATTACTGCACCGTTAAATGCTTCAACACCTAGTGTATTCATCACCGCATCAATTTGACGTGCACAAACCTCTTCCGTTTTGCCTGCTTCATTTTCAGCTTCGCTTTCACCGATACATAATACAGGAACTAAACCAGCATCTTTTAACACACCAAATTTTTTCGCAATAAATTCATCACTTTCATGATGATATTGACGTCGCTCAGAATGACCGATAATGACATATTTTACACCAAAGTCTTTTAACATTTCTGTTGAAATATCACCGGTAAATGCACCTTGTTTGTTTAAATCAACATTTTGAGTACCTAAAGCAATATCACTGCTGACCAGTGCAGTTTCAGCTTCCGCTAAATACATGACAGGCGGTGCAATTGCCACATCACAGCCTGACACCGCATTAAGTTCATCTTTTAAACCGGTAATAAGTTCTTTTGTAAAGGCTTTACTACCATTTAATTTCCAGTTACCCATGACTAAAGGACGACGAGCCAT\"\n",
193
+ "\n",
194
+ "# 获取最长的蛋白质序列\n",
195
+ "protein_sequence = find_longest_orf(dna_sequence)\n",
196
+ "print(\"最长的蛋白质序列:\", protein_sequence)\n"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": 13,
202
+ "id": "f42457cf-f946-46e5-9d31-b3c8947c0182",
203
+ "metadata": {},
204
+ "outputs": [],
205
+ "source": [
206
+ "#获得ORF完全匹配的\n",
207
+ "not_match_list1 = []\n",
208
+ "for item in not_match_list:\n",
209
+ " example_dna = item[\"seq_a\"]\n",
210
+ " example_protein = item[\"seq_b\"]\n",
211
+ " label = item[\"label\"]\n",
212
+ " \n",
213
+ " protein_trans = find_longest_orf(example_dna)\n",
214
+ "\n",
215
+ " if 1==label:\n",
216
+ " if example_protein.find(protein_trans[1:-2])==-1:#包含即可,前后可以相差几个字母\n",
217
+ " not_match_list1.append(item)\n",
218
+ " else:\n",
219
+ " pos_select_list.append(item)"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": 14,
225
+ "id": "4116c8f9-277d-41eb-bd4b-e66e20336ab5",
226
+ "metadata": {},
227
+ "outputs": [],
228
+ "source": [
229
+ "#ORF匹配较多的\n",
230
+ "from Bio import Align\n",
231
+ "\n",
232
+ "# 创建PairwiseAligner对象\n",
233
+ "aligner = Align.PairwiseAligner()\n",
234
+ "\n",
235
+ "# 设置比对模式为全局比对\n",
236
+ "aligner.mode = \"global\"\n",
237
+ "\n",
238
+ "for item in not_match_list1:\n",
239
+ " example_dna = item[\"seq_a\"]\n",
240
+ " example_protein = item[\"seq_b\"]\n",
241
+ " label = item[\"label\"]\n",
242
+ " \n",
243
+ " protein_trans = find_longest_orf(example_dna)\n",
244
+ "\n",
245
+ " \n",
246
+ " if 1==label:\n",
247
+ "\n",
248
+ " alignments = aligner.align(example_protein, protein_trans)\n",
249
+ " score = alignments[0].score\n",
250
+ " protein_trans_len = len(protein_trans)\n",
251
+ "\n",
252
+ " sim_score = score/protein_trans_len\n",
253
+ "\n",
254
+ " \n",
255
+ " if sim_score > 0.8:#匹配较高的\n",
256
+ " pos_select_list.append(item)"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 15,
262
+ "id": "b7a303cc-1bfc-4fe4-9428-addca7b000ea",
263
+ "metadata": {},
264
+ "outputs": [
265
+ {
266
+ "name": "stdout",
267
+ "output_type": "stream",
268
+ "text": [
269
+ "17067 6309\n"
270
+ ]
271
+ }
272
+ ],
273
+ "source": [
274
+ "print(len(neg_select_list),len(pos_select_list))"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": 9,
280
+ "id": "37ee3361-2fd6-441b-bb59-f355a95debae",
281
+ "metadata": {},
282
+ "outputs": [
283
+ {
284
+ "name": "stdout",
285
+ "output_type": "stream",
286
+ "text": [
287
+ "target 0 MA--ES-REPR-GAVE----A----EL-DPVEYTLRK------R-----L------P-H-\n",
288
+ " 0 |---|--|----||------|----|--|-------|------|-----|------|-|-\n",
289
+ "query 0 M-WQE-AR---HGA--WHRRATSGSE-QD-------KKKFIGGRWSHTPLVQKTTVPVHG\n",
290
+ "\n",
291
+ "target 28 ---RLP--R-RPN-DVYVNMKTDFK-AQL------A----RCQKLLDCG-ARG-------\n",
292
+ " 60 ---|-|--|-|---|------|----|--------|----|-----|---|-|-------\n",
293
+ "query 45 HGER-PTSRGR--QD------T---TA--PRTPSSAGSVPR-----D--VA-GGPPRRLA\n",
294
+ "\n",
295
+ "target 62 Q-SACSEIYIHGLG-L----AIN----RA--INIA-LQLQAGS-F---GALQ--VAAN--\n",
296
+ " 120 |-|-------|-|--|----|------|---|--|-|-|-----|---|-----|-----\n",
297
+ "query 83 QKS-------H-L-WLGYWGA--TLKTR-MWI--AEL-L----RFRITG---SRV---SV\n",
298
+ "\n",
299
+ "target 101 ------TS-TVELVDELEPE--TDT-RE-PVI-------RN-----RNNS--AIHIRVF-\n",
300
+ " 180 ------||-|||-|--|-----|---|--|---------|------|-----|---|---\n",
301
+ "query 118 SGSSSSTSSTVE-V--L---AAT--CR-AP--KLPACSCR-AMLMAR---LMA---R--P\n",
302
+ "\n",
303
+ "target 135 RVAPQ-- 140\n",
304
+ " 240 |--|--- 247\n",
305
+ "query 158 R--P-WM 162\n",
306
+ "\n",
307
+ "比对得分: 55.0\n",
308
+ "查询序列: MWQEARHGAWHRRATSGSEQDKKKFIGGRWSHTPLVQKTTVPVHGHGERPTSRGRQDTTAPRTPSSAGSVPRDVAGGPPRRLAQKSHLWLGYWGATLKTRMWIAELLRFRITGSRVSVSGSSSSTSSTVEVLAATCRAPKLPACSCRAMLMARLMARPRPWM\n",
309
+ "目标序列: MAESREPRGAVEAELDPVEYTLRKRLPHRLPRRPNDVYVNMKTDFKAQLARCQKLLDCGARGQSACSEIYIHGLGLAINRAINIALQLQAGSFGALQVAANTSTVELVDELEPETDTREPVIRNRNNSAIHIRVFRVAPQ\n",
310
+ "-----------------------------------------------------------\n",
311
+ "\n"
312
+ ]
313
+ }
314
+ ],
315
+ "source": [
316
+ "from Bio import Align\n",
317
+ "\n",
318
+ "# 创建PairwiseAligner对象\n",
319
+ "aligner = Align.PairwiseAligner()\n",
320
+ "\n",
321
+ "# 设置比对模式为全局比对\n",
322
+ "aligner.mode = \"global\"\n",
323
+ "\n",
324
+ "\n",
325
+ "# 示例蛋白质序列\n",
326
+ "seq1 = \"MAESREPRGAVEAELDPVEYTLRKRLPHRLPRRPNDVYVNMKTDFKAQLARCQKLLDCGARGQSACSEIYIHGLGLAINRAINIALQLQAGSFGALQVAANTSTVELVDELEPETDTREPVIRNRNNSAIHIRVFRVAPQ\"\n",
327
+ "seq2 = \"MWQEARHGAWHRRATSGSEQDKKKFIGGRWSHTPLVQKTTVPVHGHGERPTSRGRQDTTAPRTPSSAGSVPRDVAGGPPRRLAQKSHLWLGYWGATLKTRMWIAELLRFRITGSRVSVSGSSSSTSSTVEVLAATCRAPKLPACSCRAMLMARLMARPRPWM\"\n",
328
+ "\n",
329
+ "\n",
330
+ "# 执行比对\n",
331
+ "alignments = aligner.align(seq1, seq2)\n",
332
+ "\n",
333
+ "# 输出比对结果\n",
334
+ "for alignment in alignments:\n",
335
+ " print(alignment)\n",
336
+ " print(f\"比对得分: {alignment.score}\")\n",
337
+ " print(f\"查询序列: {alignment.query}\")\n",
338
+ " print(f\"目标序列: {alignment.target}\")\n",
339
+ " print(\"-----------------------------------------------------------\\n\")\n",
340
+ " break"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": null,
346
+ "id": "a3499a28-f01d-4e48-9c49-916763cde800",
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": []
350
+ }
351
+ ],
352
+ "metadata": {
353
+ "kernelspec": {
354
+ "display_name": "Python 3 (ipykernel)",
355
+ "language": "python",
356
+ "name": "python3"
357
+ },
358
+ "language_info": {
359
+ "codemirror_mode": {
360
+ "name": "ipython",
361
+ "version": 3
362
+ },
363
+ "file_extension": ".py",
364
+ "mimetype": "text/x-python",
365
+ "name": "python",
366
+ "nbconvert_exporter": "python",
367
+ "pygments_lexer": "ipython3",
368
+ "version": "3.11.9"
369
+ }
370
+ },
371
+ "nbformat": 4,
372
+ "nbformat_minor": 5
373
+ }
get_data/get_dna_protein_pair_rand.ipynb ADDED
@@ -0,0 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "cabe185c-850a-45be-a1fe-a0913bf921a3",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "#获得dna-蛋白质数据"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "9ff80573-0411-4244-8fdc-488f1592e5cf",
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "--2025-02-15 18:51:21-- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz\n",
24
+ " => ‘uniprot_sprot.fasta.gz’\n",
25
+ "Resolving ftp.uniprot.org (ftp.uniprot.org)... 128.175.240.195\n",
26
+ "Connecting to ftp.uniprot.org (ftp.uniprot.org)|128.175.240.195|:21... connected.\n",
27
+ "Logging in as anonymous ... Logged in!\n",
28
+ "==> SYST ... done. ==> PWD ... done.\n",
29
+ "==> TYPE I ... done. ==> CWD (1) /pub/databases/uniprot/current_release/knowledgebase/complete ... done.\n",
30
+ "==> SIZE uniprot_sprot.fasta.gz ... 92924866\n",
31
+ "==> PASV ... done. ==> RETR uniprot_sprot.fasta.gz ... done.\n",
32
+ "Length: 92924866 (89M) (unauthoritative)\n",
33
+ "\n",
34
+ "uniprot_sprot.fasta 100%[===================>] 88.62M 284KB/s in 3m 38s \n",
35
+ "\n",
36
+ "2025-02-15 18:55:02 (417 KB/s) - ‘uniprot_sprot.fasta.gz’ saved [92924866]\n",
37
+ "\n",
38
+ "tar: This does not look like a tar archive\n",
39
+ "tar: Skipping to next header\n",
40
+ "tar: Exiting with failure status due to previous errors\n"
41
+ ]
42
+ }
43
+ ],
44
+ "source": [
45
+ "#获得蛋白质fasta数据\n",
46
+ "!wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 2,
52
+ "id": "476f187e-7c70-4c19-bb81-9df4b4360529",
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "!gunzip uniprot_sprot.fasta.gz"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 3,
62
+ "id": "2c4bf4a6-8f82-4b12-aa66-f5dd89929cf2",
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "!grep \">sp\" uniprot_sprot.fasta|awk -F \"|\" '{print $2}' > uniprot_sprot.fasta.id"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 5,
72
+ "id": "36ff4e11-0d8e-46a5-956e-26cac817783b",
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stderr",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "/home/maris/miniconda3/envs/dnagpt/lib/python3.11/site-packages/Bio/pairwise2.py:278: BiopythonDeprecationWarning: Bio.pairwise2 has been deprecated, and we intend to remove it in a future release of Biopython. As an alternative, please consider using Bio.Align.PairwiseAligner as a replacement, and contact the Biopython developers if you still need the Bio.pairwise2 module.\n",
80
+ " warnings.warn(\n"
81
+ ]
82
+ }
83
+ ],
84
+ "source": [
85
+ "from Bio import Entrez, SeqIO\n",
86
+ "from Bio.Seq import Seq\n",
87
+ "import requests\n",
88
+ "from io import StringIO\n",
89
+ "import re\n",
90
+ "from Bio import pairwise2\n",
91
+ "from Bio.pairwise2 import format_alignment\n",
92
+ "\n",
93
+ "Entrez.email = \"wangliang.f@gmail.com\" #ncbi自己注册一个邮箱。https://www.ncbi.nlm.nih.gov/account/login/"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 6,
99
+ "id": "2513d8b9-edfb-4e34-8615-57d291f53557",
100
+ "metadata": {},
101
+ "outputs": [
102
+ {
103
+ "data": {
104
+ "text/plain": [
105
+ "'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL'"
106
+ ]
107
+ },
108
+ "execution_count": 6,
109
+ "metadata": {},
110
+ "output_type": "execute_result"
111
+ }
112
+ ],
113
+ "source": [
114
+ "#step 1,获得完整的fasta数据\n",
115
+ "def fetch_uniprot_protein_sequence(uniprot_id):\n",
116
+ " url = f\"https://www.uniprot.org/uniprot/{uniprot_id}.fasta\"\n",
117
+ " response = requests.get(url)\n",
118
+ " if response.status_code == 200:\n",
119
+ " fasta_data = response.text\n",
120
+ " record = SeqIO.read(StringIO(fasta_data), \"fasta\")\n",
121
+ " return str(record.seq)\n",
122
+ " else:\n",
123
+ " raise ValueError(f\"未能从 UniProt 获取蛋白质序列,状态码:{response.status_code}\")\n",
124
+ "\n",
125
+ "uniprot_id = \"Q6GZX4\" #第一条数据为例\n",
126
+ "protein_sequence = fetch_uniprot_protein_sequence(uniprot_id)\n",
127
+ "protein_sequence"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 7,
133
+ "id": "01805f1f-213e-4212-8873-2c0a83206840",
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "data": {
138
+ "text/plain": [
139
+ "'81941549'"
140
+ ]
141
+ },
142
+ "execution_count": 7,
143
+ "metadata": {},
144
+ "output_type": "execute_result"
145
+ }
146
+ ],
147
+ "source": [
148
+ "#step 2, 获得ncbi的蛋白质id,注意这个蛋白质id和uniprot的不一样\n",
149
+ "handle = Entrez.esearch(db=\"protein\", term=uniprot_id)\n",
150
+ "record = Entrez.read(handle)\n",
151
+ "handle.close()\n",
152
+ "\n",
153
+ "protein_ncbi_id = record[\"IdList\"][0]\n",
154
+ "protein_ncbi_id"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": 21,
160
+ "id": "746deb4d-40ec-4235-9ec4-c45c84889da9",
161
+ "metadata": {},
162
+ "outputs": [
163
+ {
164
+ "name": "stdout",
165
+ "output_type": "stream",
166
+ "text": [
167
+ "AY548484.1\n"
168
+ ]
169
+ }
170
+ ],
171
+ "source": [
172
+ "#step 3,获得ncbi中的数据使用ncbi id\n",
173
+ "def extract_first_xref_id(info_string):\n",
174
+ " \"\"\"\n",
175
+ " 从给定的字符串中提取 xrefs: 后面的第一个 ID。\n",
176
+ " \n",
177
+ " 参数:\n",
178
+ " - info_string (str): 包含 UniProtKB 信息的字符串\n",
179
+ " \n",
180
+ " 返回:\n",
181
+ " - str 或 None: 如果找到,则返回第一个 ID;否则返回 None。\n",
182
+ " \"\"\"\n",
183
+ " # 使用正则表达式查找 'xrefs:' 后面的第一个 ID\n",
184
+ " match = re.search(r'xrefs:\\s*([\\w.-]+)', info_string)\n",
185
+ " if match:\n",
186
+ " return match.group(1) # 返回匹配到的第一个 ID\n",
187
+ " else:\n",
188
+ " return None\n",
189
+ "\n",
190
+ "\n",
191
+ "def fetch_ncbi_genbank_data(protein_ncbi_id):\n",
192
+ " #STEP 1, 获得protein 数据\n",
193
+ " handle = Entrez.efetch(db=\"protein\", id=protein_ncbi_id, rettype=\"gb\", retmode=\"text\")\n",
194
+ " genbank_data = handle.read()\n",
195
+ " handle.close()\n",
196
+ "\n",
197
+ " #获得dna的id\n",
198
+ " rec = SeqIO.parse(StringIO(genbank_data), \"genbank\")\n",
199
+ " for item in rec:\n",
200
+ " record = item\n",
201
+ " break\n",
202
+ "\n",
203
+ " db_source = record.annotations[\"db_source\"]\n",
204
+ " xref_id = extract_first_xref_id(db_source)\n",
205
+ "\n",
206
+ " print(xref_id)\n",
207
+ " \n",
208
+ "\n",
209
+ " #step2,获得dna数据\n",
210
+ " r_handle = Entrez.efetch(id=xref_id, db='nucleotide', rettype='gb', retmode='text')\n",
211
+ " dna_data = r_handle.read()\n",
212
+ " r_handle.close()\n",
213
+ " \n",
214
+ " return dna_data\n",
215
+ "\n",
216
+ "genbank_data = fetch_ncbi_genbank_data(protein_ncbi_id)\n",
217
+ "#print(genbank_data)"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 15,
223
+ "id": "90be7571-db8d-4bc8-b6ee-0b734a4d112e",
224
+ "metadata": {},
225
+ "outputs": [
226
+ {
227
+ "name": "stdout",
228
+ "output_type": "stream",
229
+ "text": [
230
+ "LOCUS 001R_FRG3G 256 aa linear VRL 08-NOV-2023\n",
231
+ "DEFINITION RecName: Full=Putative transcription factor 001R.\n",
232
+ "ACCESSION Q6GZX4\n",
233
+ "VERSION Q6GZX4.1\n",
234
+ "DBSOURCE UniProtKB: locus 001R_FRG3G, accession Q6GZX4;\n",
235
+ " class: standard.\n",
236
+ " created: Jun 28, 2011.\n",
237
+ " sequence updated: Jul 19, 2004.\n",
238
+ " annotation updated: Nov 8, 2023.\n",
239
+ " xrefs: AY548484.1, AAT09660.1, YP_031579.1\n",
240
+ " xrefs (non-sequence databases): SwissPalm:Q6GZX4, GeneID:2947773,\n",
241
+ " KEGG:vg:2947773, Proteomes:UP000008770, GO:0046782,\n",
242
+ " InterPro:IPR007031, Pfam:PF04947\n",
243
+ "KEYWORDS Activator; Reference proteome; Transcription; Transcription\n",
244
+ " regulation.\n",
245
+ "SOURCE Frog virus 3 (isolate Goorha)\n",
246
+ " ORGANISM Frog virus 3 (isolate Goorha)\n",
247
+ " Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota;\n",
248
+ " Megaviricetes; Pimascovirales; Iridoviridae; Alphairidovirinae;\n",
249
+ " Ranavirus; Frog virus 3.\n",
250
+ "REFERENCE 1 (residues 1 to 256)\n",
251
+ " AUTHORS Tan,W.G., Barkman,T.J., Gregory Chinchar,V. and Essani,K.\n",
252
+ " TITLE Comparative genomic analyses of frog virus 3, type species of the\n",
253
+ " genus Ranavirus (family Iridoviridae)\n",
254
+ " JOURNAL Virology 323 (1), 70-84 (2004)\n",
255
+ " PUBMED 15165820\n",
256
+ " REMARK NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].\n",
257
+ "COMMENT [FUNCTION] Transcription activation. {ECO:0000305}.\n",
258
+ "FEATURES Location/Qualifiers\n",
259
+ " source 1..256\n",
260
+ " /organism=\"Frog virus 3 (isolate Goorha)\"\n",
261
+ " /host=\"Dryophytes versicolor (chameleon treefrog)\"\n",
262
+ " /host=\"Lithobates pipiens (Northern leopard frog) (Rana\n",
263
+ " pipiens)\"\n",
264
+ " /host=\"Lithobates sylvaticus (Wood frog) (Rana sylvatica)\"\n",
265
+ " /host=\"Notophthalmus viridescens (Eastern newt) (Triturus\n",
266
+ " viridescens)\"\n",
267
+ " /db_xref=\"taxon:654924\"\n",
268
+ " gene 1..256\n",
269
+ " /locus_tag=\"FV3-001R\"\n",
270
+ " Protein 1..256\n",
271
+ " /product=\"Putative transcription factor 001R\"\n",
272
+ " /UniProtKB_evidence=\"Predicted\"\n",
273
+ " Region 1..256\n",
274
+ " /region_name=\"Mature chain\"\n",
275
+ " /note=\"Putative transcription factor 001R.\n",
276
+ " /id=PRO_0000410512.\"\n",
277
+ " Region 81..253\n",
278
+ " /region_name=\"Pox_VLTF3\"\n",
279
+ " /note=\"Poxvirus Late Transcription Factor VLTF3 like;\n",
280
+ " pfam04947\"\n",
281
+ " /db_xref=\"CDD:282761\"\n",
282
+ "ORIGIN \n",
283
+ " 1 mafsaedvlk eydrrrrmea lllslyypnd rklldykews pprvqvecpk apvewnnpps\n",
284
+ " 61 ekglivghfs gikykgekaq asevdvnkmc cwvskfkdam rryqgiqtck ipgkvlsdld\n",
285
+ " 121 akikaynltv egvegfvrys rvtkqhvaaf lkelrhskqy envnlihyil tdkrvdiqhl\n",
286
+ " 181 ekdlvkdfka lvesahrmrq ghminvkyil yqllkkhghg pdgpdiltvk tgskgvlydd\n",
287
+ " 241 sfrkiytdlg wkftpl\n",
288
+ "//\n",
289
+ "\n",
290
+ "\n"
291
+ ]
292
+ }
293
+ ],
294
+ "source": [
295
+ "#分步测试,STEP 1, 获得protein 数据 这个里面应该有对应的dna数据CDS的,但其实没有。。\n",
296
+ "handle = Entrez.efetch(db=\"protein\", id=\"81941549\", rettype=\"gb\", retmode=\"text\")\n",
297
+ "genbank_data_protein = handle.read()\n",
298
+ "handle.close()\n",
299
+ "print(genbank_data_protein) #需要其中的db_source中xrefs里面的数据中的第1个,也就是AY548484.1"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": 17,
305
+ "id": "1b3c03e0-ea22-4ec4-aa4d-4994485114f1",
306
+ "metadata": {},
307
+ "outputs": [],
308
+ "source": [
309
+ "def calculate_similarity(seq1, seq2):\n",
310
+ " \"\"\"使用局部比对计算两个序列之间的相似度得分\"\"\"\n",
311
+ " alignments = pairwise2.align.localxx(seq1, seq2)\n",
312
+ " best_score = max(aln.score for aln in alignments) if alignments else 0\n",
313
+ " return best_score\n",
314
+ "\n",
315
+ "def extract_cds_and_translate(genbank_data, protein_seq_ori=None):\n",
316
+ " results = []\n",
317
+ " record = None\n",
318
+ "\n",
319
+ " # 使用 StringIO 加载 GenBank 数据\n",
320
+ " for rec in SeqIO.parse(StringIO(genbank_data), \"genbank\"):\n",
321
+ " record = rec\n",
322
+ " break\n",
323
+ "\n",
324
+ " \n",
325
+ " if not record:\n",
326
+ " raise ValueError(\"未能成功解析 GenBank 数据\")\n",
327
+ "\n",
328
+ " gene_id = record.id\n",
329
+ "\n",
330
+ " for feature in record.features:\n",
331
+ " if feature.type == \"CDS\":\n",
332
+ " cds_start = feature.location.start\n",
333
+ " cds_end = feature.location.end\n",
334
+ " cds_sequence = record.seq[cds_start:cds_end]\n",
335
+ " protein_sequence = feature.qualifiers[\"translation\"][0]\n",
336
+ " protein_id = feature.qualifiers[\"protein_id\"][0]\n",
337
+ " \n",
338
+ " sim_score = calculate_similarity(protein_sequence, protein_seq_ori)\n",
339
+ " results.append({\n",
340
+ " \"protein_id\":protein_id,\n",
341
+ " \"gene_id\": gene_id ,\n",
342
+ " \"cds_start\": cds_start,\n",
343
+ " \"cds_end\": cds_end,\n",
344
+ " \"dna_sequence\": str(cds_sequence),\n",
345
+ " \"protein_sequence\": str(protein_sequence),\n",
346
+ " \"sim\":sim_score\n",
347
+ " })\n",
348
+ " # 使用 sorted() 函数并指定 key 和 reverse 参数\n",
349
+ " sorted_results = sorted(results, key=lambda x: x['sim'], reverse=True)\n",
350
+ " return sorted_results\n",
351
+ "\n",
352
+ "cds_data_list = extract_cds_and_translate(genbank_data, protein_sequence)"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": 19,
358
+ "id": "fdd02166-9af8-433e-a6c6-051482759623",
359
+ "metadata": {},
360
+ "outputs": [
361
+ {
362
+ "data": {
363
+ "text/plain": [
364
+ "{'protein_id': 'AAT09660.1',\n",
365
+ " 'gene_id': 'AY548484.1',\n",
366
+ " 'cds_start': ExactPosition(271),\n",
367
+ " 'cds_end': ExactPosition(1042),\n",
368
+ " 'dna_sequence': 'ATGGCATTCTCGGCAGAAGATGTGCTGAAGGAGTACGACAGGAGACGGAGGATGGAGGCCCTCTTGCTCAGCCTGTACTACCCAAACGACCGCAAGCTCCTAGACTACAAAGAGTGGTCTCCGCCCAGGGTTCAGGTAGAGTGTCCCAAAGCCCCCGTGGAGTGGAACAACCCTCCGTCAGAAAAGGGTCTCATCGTGGGGCACTTTAGCGGCATAAAGTACAAGGGGGAAAAGGCTCAGGCATCCGAGGTAGACGTCAACAAGATGTGCTGCTGGGTGTCCAAGTTTAAAGACGCCATGAGGAGGTACCAGGGCATACAGACTTGCAAGATCCCCGGCAAGGTCCTGTCGGACCTCGACGCCAAAATAAAGGCTTACAACCTCACCGTTGAGGGCGTAGAGGGTTTCGTGAGGTACTCACGAGTGACCAAGCAGCACGTAGCAGCTTTCCTCAAGGAGCTCAGGCACTCTAAGCAGTACGAAAACGTCAACCTCATCCACTACATCCTCACCGACAAGAGGGTAGACATTCAGCACCTGGAAAAGGATCTTGTCAAGGATTTTAAGGCGCTGGTGGAATCTGCTCACAGGATGAGGCAGGGCCACATGATCAACGTAAAGTACATACTCTACCAGCTCCTCAAGAAGCACGGTCACGGGCCAGACGGTCCAGACATCCTGACCGTAAAGACTGGAAGCAAGGGAGTCTTGTACGACGATTCCTTTCGCAAGATTTACACGGACCTCGGGTGGAAGTTTACCCCCCTATGA',\n",
369
+ " 'protein_sequence': 'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL',\n",
370
+ " 'sim': 256.0}"
371
+ ]
372
+ },
373
+ "execution_count": 19,
374
+ "metadata": {},
375
+ "output_type": "execute_result"
376
+ }
377
+ ],
378
+ "source": [
379
+ "cds_data_list[0]"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": null,
385
+ "id": "58031699-a779-44f8-b88e-2efb6b53d757",
386
+ "metadata": {},
387
+ "outputs": [],
388
+ "source": []
389
+ }
390
+ ],
391
+ "metadata": {
392
+ "kernelspec": {
393
+ "display_name": "Python 3 (ipykernel)",
394
+ "language": "python",
395
+ "name": "python3"
396
+ },
397
+ "language_info": {
398
+ "codemirror_mode": {
399
+ "name": "ipython",
400
+ "version": 3
401
+ },
402
+ "file_extension": ".py",
403
+ "mimetype": "text/x-python",
404
+ "name": "python",
405
+ "nbconvert_exporter": "python",
406
+ "pygments_lexer": "ipython3",
407
+ "version": "3.11.9"
408
+ }
409
+ },
410
+ "nbformat": 4,
411
+ "nbformat_minor": 5
412
+ }
get_data/get_lucaone_data.ipynb ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "36cb4bfb-c6e0-4924-a91d-57dfccd63801",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "#获得lucaone的中心法则相关数据"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "01c3e3cc-7f70-4f9f-89f4-ed236845e64e",
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "--2025-02-15 11:56:31-- http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/train/train.csv\n",
24
+ "Connecting to 47.93.21.181:80... connected.\n",
25
+ "HTTP request sent, awaiting response... 200 OK\n",
26
+ "Length: 5302028 (5.1M) [application/octet-stream]\n",
27
+ "Saving to: ‘train.csv’\n",
28
+ "\n",
29
+ "train.csv 100%[===================>] 5.06M 12.7MB/s in 0.4s \n",
30
+ "\n",
31
+ "2025-02-15 11:56:31 (12.7 MB/s) - ‘train.csv’ saved [5302028/5302028]\n",
32
+ "\n",
33
+ "--2025-02-15 11:56:31-- http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/test/test.csv\n",
34
+ "Connecting to 47.93.21.181:80... connected.\n",
35
+ "HTTP request sent, awaiting response... 200 OK\n",
36
+ "Length: 33131633 (32M) [application/octet-stream]\n",
37
+ "Saving to: ‘test.csv’\n",
38
+ "\n",
39
+ "test.csv 100%[===================>] 31.60M 9.59MB/s in 3.3s \n",
40
+ "\n",
41
+ "2025-02-15 11:56:35 (9.59 MB/s) - ‘test.csv’ saved [33131633/33131633]\n",
42
+ "\n",
43
+ "--2025-02-15 11:56:35-- http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/dev/dev.csv\n",
44
+ "Connecting to 47.93.21.181:80... connected.\n",
45
+ "HTTP request sent, awaiting response... 200 OK\n",
46
+ "Length: 3982657 (3.8M) [application/octet-stream]\n",
47
+ "Saving to: ‘dev.csv’\n",
48
+ "\n",
49
+ "dev.csv 100%[===================>] 3.80M 8.46MB/s in 0.4s \n",
50
+ "\n",
51
+ "2025-02-15 11:56:35 (8.46 MB/s) - ‘dev.csv’ saved [3982657/3982657]\n",
52
+ "\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "!wget http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/train/train.csv\n",
58
+ "!wget http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/test/test.csv\n",
59
+ "!wget http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/dev/dev.csv"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": 3,
65
+ "id": "23d39250-50a6-4557-9cb9-2d637ce52835",
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "#合并文件\n",
70
+ "# 先复制第一个文件的内容(包括表头)\n",
71
+ "!cp train.csv central_dogma.csv\n",
72
+ "\n",
73
+ "# 然后将后续文件的内容(跳过表头)追加到combined.csv中\n",
74
+ "!tail -n +2 test.csv >> central_dogma.csv\n",
75
+ "!tail -n +2 dev.csv >> central_dogma.csv"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "id": "d54081eb-d30c-4b3f-8a6d-0d1b53c37fd7",
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": []
85
+ }
86
+ ],
87
+ "metadata": {
88
+ "kernelspec": {
89
+ "display_name": "Python 3 (ipykernel)",
90
+ "language": "python",
91
+ "name": "python3"
92
+ },
93
+ "language_info": {
94
+ "codemirror_mode": {
95
+ "name": "ipython",
96
+ "version": 3
97
+ },
98
+ "file_extension": ".py",
99
+ "mimetype": "text/x-python",
100
+ "name": "python",
101
+ "nbconvert_exporter": "python",
102
+ "pygments_lexer": "ipython3",
103
+ "version": "3.11.9"
104
+ }
105
+ },
106
+ "nbformat": 4,
107
+ "nbformat_minor": 5
108
+ }
get_data/get_protein_dna_pair.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
import requests
from io import StringIO
import re
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# NCBI Entrez requires a contact e-mail on every request; configure it once
# at import time so every efetch/esearch call below identifies the caller.
Entrez.email = "wangliang.f@gmail.com"

12
def calculate_similarity(seq1, seq2):
    """Score the similarity of two sequences with a local pairwise alignment.

    Returns the best local-alignment score found by ``pairwise2.align.localxx``,
    or 0 when no alignment exists.
    """
    alignments = pairwise2.align.localxx(seq1, seq2)
    if not alignments:
        return 0
    return max(alignment.score for alignment in alignments)
17
+
18
+
19
+
20
def extract_first_xref_id(info_string):
    """Extract the first accession ID that follows 'xrefs:' in a string.

    Parameters:
    - info_string (str): text containing UniProtKB cross-reference info
      (typically a GenBank ``db_source`` annotation).

    Returns:
    - str or None: the first ID after 'xrefs:', or None when no match exists.
    """
    found = re.search(r'xrefs:\s*([\w.-]+)', info_string)
    return found.group(1) if found else None
36
+
37
+
38
+
39
def fetch_uniprot_protein_sequence(uniprot_id):
    """Download the protein sequence for *uniprot_id* from UniProt as FASTA.

    Returns the amino-acid sequence as a plain string.
    Raises ValueError when the HTTP response status is not 200.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    response = requests.get(url)
    if response.status_code != 200:
        raise ValueError(f"未能从 UniProt 获取蛋白质序列,状态码:{response.status_code}")
    record = SeqIO.read(StringIO(response.text), "fasta")
    return str(record.seq)
48
+
49
def fetch_ncbi_genbank_data(protein_ncbi_id):
    """Fetch the nucleotide GenBank record cross-referenced by an NCBI protein.

    Step 1: download the protein GenBank record and read its ``db_source``
    annotation to locate the first cross-referenced nucleotide accession.
    Step 2: download that nucleotide record and return it as raw GenBank text.

    Parameters:
    - protein_ncbi_id (str): NCBI protein database ID.

    Returns:
    - str: GenBank-format text of the cross-referenced nucleotide record.

    Raises ValueError when the protein record cannot be parsed or carries no
    usable nucleotide cross-reference.
    """
    # Step 1: fetch the protein record.
    handle = Entrez.efetch(db="protein", id=protein_ncbi_id, rettype="gb", retmode="text")
    genbank_data = handle.read()
    handle.close()

    # efetch with a single ID yields one entry; take the first record only.
    # The old loop-and-break left `record` unbound (NameError) when parsing
    # produced nothing, so fail explicitly instead.
    record = next(SeqIO.parse(StringIO(genbank_data), "genbank"), None)
    if record is None:
        raise ValueError(f"failed to parse protein GenBank record for {protein_ncbi_id}")

    # Missing db_source would previously raise an opaque KeyError.
    db_source = record.annotations.get("db_source", "")
    xref_id = extract_first_xref_id(db_source)
    print("xref_id", xref_id)
    if xref_id is None:
        # Without this guard the code issued Entrez.efetch(id=None, ...).
        raise ValueError(f"protein record {protein_ncbi_id} has no nucleotide cross-reference")

    # Step 2: fetch the cross-referenced nucleotide record.
    r_handle = Entrez.efetch(id=xref_id, db='nucleotide', rettype='gb', retmode='text')
    dna_data = r_handle.read()
    r_handle.close()

    return dna_data
74
+
75
def extract_cds_and_translate(genbank_data, protein_seq_ori=None):
    """Extract every CDS feature from GenBank text, ranked by similarity.

    Parameters:
    - genbank_data (str): raw GenBank-format text for one nucleotide record.
    - protein_seq_ori (str | None): reference protein sequence; when given,
      each CDS translation is scored against it, otherwise the score is 0.

    Returns:
    - list[dict]: one entry per usable CDS with keys gene_id, protein_id,
      cds_start, cds_end, dna_sequence, protein_sequence, sim — sorted by
      descending similarity score.

    Raises ValueError when the GenBank text cannot be parsed.
    """
    # Take only the first record of the payload (one accession per fetch).
    record = next(SeqIO.parse(StringIO(genbank_data), "genbank"), None)
    if record is None:
        raise ValueError("未能成功解析 GenBank 数据")

    gene_id = record.id
    results = []

    for feature in record.features:
        if feature.type != "CDS":
            continue
        qualifiers = feature.qualifiers
        # Pseudogenes / partial CDS features may lack these qualifiers; the
        # old code raised KeyError on them, now they are skipped.
        if "translation" not in qualifiers or "protein_id" not in qualifiers:
            continue

        cds_start = feature.location.start
        cds_end = feature.location.end
        cds_sequence = record.seq[cds_start:cds_end]
        protein_sequence = qualifiers["translation"][0]
        protein_id = qualifiers["protein_id"][0]

        # Only align against the reference when one was provided; with the
        # declared default protein_seq_ori=None the old code crashed inside
        # calculate_similarity.
        if protein_seq_ori:
            sim_score = calculate_similarity(protein_sequence, protein_seq_ori)
        else:
            sim_score = 0

        results.append({
            "gene_id": gene_id,
            "protein_id": protein_id,
            "cds_start": cds_start,
            "cds_end": cds_end,
            "dna_sequence": str(cds_sequence),
            "protein_sequence": str(protein_sequence),
            "sim": sim_score,
        })

    # Best-matching CDS first.
    return sorted(results, key=lambda entry: entry["sim"], reverse=True)
118
+
119
def get_protein_and_dna_sequences(uniprot_id):
    """Resolve a UniProt ID into a matched DNA/protein sequence pair.

    Pipeline: fetch the protein sequence from UniProt, look up the matching
    NCBI protein record, download its cross-referenced nucleotide GenBank
    record, and extract all CDS candidates ranked by similarity to the
    UniProt sequence.

    Returns a dict with keys ``uniprot_id`` / ``protein_sequence`` /
    ``cds_data`` on success, or ``{"error": message}`` on any failure.
    """
    try:
        print(f"正在获取 UniProt ID: {uniprot_id} 的蛋白质序列...")
        protein_sequence = fetch_uniprot_protein_sequence(uniprot_id)

        print("正在获取 NCBI 数据...")
        search_handle = Entrez.esearch(db="protein", term=uniprot_id)
        search_result = Entrez.read(search_handle)
        search_handle.close()

        id_list = search_result["IdList"]
        if not id_list:
            raise ValueError("未找到对应的蛋白质记录")

        genbank_data = fetch_ncbi_genbank_data(id_list[0])

        print("正在提取 DNA 和蛋白质序列...")
        cds_data = extract_cds_and_translate(genbank_data, protein_sequence)

        return {
            "uniprot_id": uniprot_id,
            "protein_sequence": protein_sequence,
            "cds_data": cds_data,
        }
    except Exception as e:
        print(f"发生错误:{e}")
        return {"error": str(e)}
157
+
158
def process_data(uniprot_id):
    """Build a gene/protein pair record for *uniprot_id*.

    Returns a dict describing the best-matching CDS and its protein
    (seq_a = DNA, seq_b = protein), or -1 when the lookup failed or
    produced no CDS candidates.
    """
    result = get_protein_and_dna_sequences(uniprot_id)
    if "error" in result:
        print(f"错误:{result['error']}")
        return -1

    cds_candidates = result["cds_data"]
    if not cds_candidates:
        return -1

    # Candidates are pre-sorted by similarity, so the first is the best hit.
    best = cds_candidates[0]
    return {
        "seq_id_a": best["gene_id"],
        "seq_type_a": "gene",
        "seq_a": best["dna_sequence"],
        "seq_id_b": uniprot_id,
        "seq_type_b": "pro",
        "seq_b": best["protein_sequence"],
        "protein_id": best["protein_id"],
    }
180
+
181
if __name__ == "__main__":
    # Smoke test: fetch the gene/protein pair for a known UniProt accession.
    ret = process_data("Q9Z3S1")
    print(ret)