Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- get_data/central_dogma.csv +3 -0
- get_data/convert_lucaone_data.ipynb +373 -0
- get_data/get_dna_protein_pair_rand.ipynb +412 -0
- get_data/get_lucaone_data.ipynb +108 -0
- get_data/get_protein_dna_pair.py +183 -0
.gitattributes
CHANGED
|
@@ -45,3 +45,4 @@ train_data/ja_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
|
|
| 45 |
train_data/ko_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
|
| 46 |
train_data/protein_4g.txt filter=lfs diff=lfs merge=lfs -text
|
| 47 |
train_data/zh_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 45 |
train_data/ko_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
|
| 46 |
train_data/protein_4g.txt filter=lfs diff=lfs merge=lfs -text
|
| 47 |
train_data/zh_wiki_4g.txt filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
get_data/central_dogma.csv filter=lfs diff=lfs merge=lfs -text
|
get_data/central_dogma.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cdf03ee452e252ac282c561121a06cff89c80544f34c03a77a8558b46c14e228
|
| 3 |
+
size 42416200
|
get_data/convert_lucaone_data.ipynb
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "f3ad096d-492a-4da2-a390-03c7e7453821",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stderr",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"/home/maris/miniconda3/envs/dnagpt/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 14 |
+
" from .autonotebook import tqdm as notebook_tqdm\n",
|
| 15 |
+
"Generating train split: 25600 examples [00:00, 82207.06 examples/s]\n"
|
| 16 |
+
]
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"data": {
|
| 20 |
+
"text/plain": [
|
| 21 |
+
"DatasetDict({\n",
|
| 22 |
+
" train: Dataset({\n",
|
| 23 |
+
" features: ['seq_id_a', 'seq_id_b', 'seq_type_a', 'seq_type_b', 'seq_a', 'seq_b', 'label'],\n",
|
| 24 |
+
" num_rows: 25600\n",
|
| 25 |
+
" })\n",
|
| 26 |
+
"})"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"execution_count": 1,
|
| 30 |
+
"metadata": {},
|
| 31 |
+
"output_type": "execute_result"
|
| 32 |
+
}
|
| 33 |
+
],
|
| 34 |
+
"source": [
|
| 35 |
+
"from datasets import load_dataset\n",
|
| 36 |
+
"from transformers import AutoTokenizer, DataCollatorWithPadding\n",
|
| 37 |
+
"from transformers import Trainer\n",
|
| 38 |
+
"import evaluate\n",
|
| 39 |
+
"import numpy as np\n",
|
| 40 |
+
"from transformers import TrainingArguments\n",
|
| 41 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
| 42 |
+
"\n",
|
| 43 |
+
"raw_datasets = load_dataset('csv', data_files='central_dogma.csv')\n",
|
| 44 |
+
"raw_datasets"
|
| 45 |
+
]
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"cell_type": "code",
|
| 49 |
+
"execution_count": 3,
|
| 50 |
+
"id": "8d89ca1d-1968-43d3-8d47-f9f87b02cd02",
|
| 51 |
+
"metadata": {},
|
| 52 |
+
"outputs": [
|
| 53 |
+
{
|
| 54 |
+
"data": {
|
| 55 |
+
"text/plain": [
|
| 56 |
+
"'LKIELASSYGFCFGVKRAIKIAENAGDAATIGPLIHNNEEINRLATNFNVKTLHGINELKDEKKAIIRTHGITKSDLAELKKTDIKVIDATCPFVTKPQQICEDMSNAGYDVVIFGDENHPEVKGVKSYASGKVYVVLDESELEGVKFRQKVALVSQTTRKVEKFMQIANYLMLRVKEVRVFNTICNATFENQEAVKNLAKRADVMIVIGGKNSSNTKQLYLISKNFCEDSYLIESEHEVEKSWFEGKNLCGISAGASTPDWIIQKVVDAIEKF*'"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
"execution_count": 3,
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"output_type": "execute_result"
|
| 62 |
+
}
|
| 63 |
+
],
|
| 64 |
+
"source": [
|
| 65 |
+
"from Bio.Seq import Seq\n",
|
| 66 |
+
"\n",
|
| 67 |
+
"def translate_dna_to_protein_biopython(dna_sequence):\n",
|
| 68 |
+
" \"\"\"Translate a DNA sequence into its corresponding protein sequence using Biopython.\"\"\"\n",
|
| 69 |
+
" # 确保输入的是大写的DNA序列\n",
|
| 70 |
+
" dna_seq = Seq(dna_sequence.upper())\n",
|
| 71 |
+
" \n",
|
| 72 |
+
" # 使用Biopython内置方法进行翻译\n",
|
| 73 |
+
" protein_seq = dna_seq.translate(to_stop=False) # 如果需要在终止密码子处停止,请设置to_stop=True\n",
|
| 74 |
+
" \n",
|
| 75 |
+
" return str(protein_seq)\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"def trim_sequence(sequence, front, length):\n",
|
| 78 |
+
" \"\"\"Trim the specified number of characters from the start and end of a string.\"\"\"\n",
|
| 79 |
+
" # if len(sequence) <= front + back:\n",
|
| 80 |
+
" # raise ValueError(\"The sequence is too short to trim the specified number of characters.\")\n",
|
| 81 |
+
" # return sequence[front:-back] if back > 0 else sequence[front:]\n",
|
| 82 |
+
" return sequence[front:front+length]\n",
|
| 83 |
+
"\n",
|
| 84 |
+
"translate_dna_to_protein_biopython(\"TTGAAGATTGAGCTTGCTAGCAGCTACGGCTTTTGCTTTGGGGTAAAGCGCGCCATAAAGATAGCCGAAAATGCGGGCGATGCCGCTACTATCGGGCCTCTCATACATAATAACGAAGAGATAAACCGCCTGGCTACGAATTTCAATGTCAAGACCCTCCACGGCATAAATGAGCTAAAGGACGAGAAAAAGGCCATCATACGCACTCACGGTATCACAAAAAGCGATCTGGCCGAGCTTAAAAAGACCGATATCAAAGTCATAGACGCCACTTGCCCGTTCGTGACCAAGCCGCAGCAAATTTGCGAGGATATGAGCAACGCAGGATACGATGTCGTGATATTTGGCGATGAAAATCATCCCGAAGTCAAAGGAGTGAAGTCCTATGCCAGCGGAAAGGTTTATGTCGTGCTCGATGAGAGCGAGCTTGAGGGAGTGAAATTTAGACAAAAGGTAGCACTCGTCAGTCAAACGACGCGCAAAGTCGAAAAATTTATGCAAATAGCGAACTACTTGATGCTACGCGTCAAAGAGGTGCGAGTTTTCAACACTATCTGCAACGCGACCTTCGAGAATCAGGAGGCGGTCAAAAATTTAGCCAAAAGAGCCGATGTGATGATAGTCATCGGTGGTAAAAATAGCTCTAATACAAAGCAGCTTTATCTGATATCTAAAAATTTCTGCGAGGACAGCTACCTGATAGAGAGCGAACACGAAGTCGAGAAAAGCTGGTTTGAAGGCAAGAATTTATGCGGTATAAGTGCGGGAGCGAGCACGCCTGATTGGATCATACAAAAAGTCGTCGACGCGATAGAGAAATTTTAA\")"
|
| 85 |
+
]
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"cell_type": "code",
|
| 89 |
+
"execution_count": 11,
|
| 90 |
+
"id": "682a14ef-7492-434e-bed0-af0f5c03db86",
|
| 91 |
+
"metadata": {},
|
| 92 |
+
"outputs": [
|
| 93 |
+
{
|
| 94 |
+
"name": "stderr",
|
| 95 |
+
"output_type": "stream",
|
| 96 |
+
"text": [
|
| 97 |
+
"/home/maris/miniconda3/envs/dnagpt/lib/python3.11/site-packages/Bio/Seq.py:2879: BiopythonWarning: Partial codon, len(sequence) not a multiple of three. Explicitly trim the sequence or add trailing N before translation. This may become an error in future.\n",
|
| 98 |
+
" warnings.warn(\n"
|
| 99 |
+
]
|
| 100 |
+
}
|
| 101 |
+
],
|
| 102 |
+
"source": [
|
| 103 |
+
"#获得完全匹配的正例\n",
|
| 104 |
+
"not_match_list = []\n",
|
| 105 |
+
"pos_select_list = [] #正例\n",
|
| 106 |
+
"neg_select_list = [] #负例\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"for item in raw_datasets[\"train\"]:\n",
|
| 109 |
+
" example_dna = item[\"seq_a\"]\n",
|
| 110 |
+
" example_protein = item[\"seq_b\"]\n",
|
| 111 |
+
" label = item[\"label\"]\n",
|
| 112 |
+
"\n",
|
| 113 |
+
" if 0==label: #负例都要\n",
|
| 114 |
+
" neg_select_list.append(item)\n",
|
| 115 |
+
" \n",
|
| 116 |
+
" dna_length = len(example_protein)*3\n",
|
| 117 |
+
" trimmed_sequence = trim_sequence(example_dna,100,dna_length)\n",
|
| 118 |
+
" protein_trans = translate_dna_to_protein_biopython(trimmed_sequence)\n",
|
| 119 |
+
"\n",
|
| 120 |
+
" if 1==label:\n",
|
| 121 |
+
" if protein_trans[1:-2]!=example_protein[1:-2]: #运行有前后1个字符不一样的\n",
|
| 122 |
+
" not_match_list.append(item)\n",
|
| 123 |
+
" else:\n",
|
| 124 |
+
" pos_select_list.append(item)"
|
| 125 |
+
]
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"cell_type": "code",
|
| 129 |
+
"execution_count": 12,
|
| 130 |
+
"id": "9abb1e66-5d3a-4d87-9bf3-577cec8efcf9",
|
| 131 |
+
"metadata": {},
|
| 132 |
+
"outputs": [
|
| 133 |
+
{
|
| 134 |
+
"name": "stdout",
|
| 135 |
+
"output_type": "stream",
|
| 136 |
+
"text": [
|
| 137 |
+
"最长的蛋白质序列: MARRPLVMGNWKLNGSKAFTKELITGLKDELNAVSGCDVAIAPPVMYLAEAETALVSSDIALGTQNVDLNKQGAFTGDISTEMLKDFGVKYVIIGHSERRQYHHESDEFIAKKFGVLKDAGLVPVLCIGESEAENEAGKTEEVCARQIDAVMNTLGVEAFNGAVIAYEPIWAIGTGKSATPAQAQAVHAFIRGHIAKQSQAVAERVIIQYGGSVNDANAAELFTQPDIDGALVGGASLKASAFAVIVKAAAKAKN\n"
|
| 138 |
+
]
|
| 139 |
+
}
|
| 140 |
+
],
|
| 141 |
+
"source": [
|
| 142 |
+
"from Bio.Seq import Seq\n",
|
| 143 |
+
"\n",
|
| 144 |
+
"def find_longest_orf(dna_sequence_str):\n",
|
| 145 |
+
" dna_sequence = Seq(dna_sequence_str)\n",
|
| 146 |
+
" # 定义可能的终止密码子\n",
|
| 147 |
+
" stop_codons = ['TAA', 'TAG', 'TGA']\n",
|
| 148 |
+
" \n",
|
| 149 |
+
" # 初始化最长ORF和其长度\n",
|
| 150 |
+
" longest_orf = ''\n",
|
| 151 |
+
" longest_length = 0\n",
|
| 152 |
+
" \n",
|
| 153 |
+
" # 遍历正向链的三个阅读框\n",
|
| 154 |
+
" for frame in range(3):\n",
|
| 155 |
+
" seq = dna_sequence[frame:]\n",
|
| 156 |
+
" start = None\n",
|
| 157 |
+
" for i in range(0, len(seq) - 2, 3):\n",
|
| 158 |
+
" codon = seq[i:i+3]\n",
|
| 159 |
+
" if codon == 'ATG' and start is None:\n",
|
| 160 |
+
" start = i\n",
|
| 161 |
+
" if codon in stop_codons and start is not None:\n",
|
| 162 |
+
" orf = seq[start:i+3]\n",
|
| 163 |
+
" if len(orf) > longest_length:\n",
|
| 164 |
+
" longest_orf = orf\n",
|
| 165 |
+
" longest_length = len(orf)\n",
|
| 166 |
+
" start = None\n",
|
| 167 |
+
" \n",
|
| 168 |
+
" # 遍历反向互补链的三个阅读框\n",
|
| 169 |
+
" rev_seq = dna_sequence.reverse_complement()\n",
|
| 170 |
+
" for frame in range(3):\n",
|
| 171 |
+
" seq = rev_seq[frame:]\n",
|
| 172 |
+
" start = None\n",
|
| 173 |
+
" for i in range(0, len(seq) - 2, 3):\n",
|
| 174 |
+
" codon = seq[i:i+3]\n",
|
| 175 |
+
" if codon == 'ATG' and start is None:\n",
|
| 176 |
+
" start = i\n",
|
| 177 |
+
" if codon in stop_codons and start is not None:\n",
|
| 178 |
+
" orf = seq[start:i+3]\n",
|
| 179 |
+
" if len(orf) > longest_length:\n",
|
| 180 |
+
" longest_orf = orf\n",
|
| 181 |
+
" longest_length = len(orf)\n",
|
| 182 |
+
" start = None\n",
|
| 183 |
+
" \n",
|
| 184 |
+
" # 翻译最长ORF,去除终止符号\n",
|
| 185 |
+
" if longest_orf:\n",
|
| 186 |
+
" protein_sequence = longest_orf.translate(to_stop=True)\n",
|
| 187 |
+
" return str(protein_sequence)\n",
|
| 188 |
+
" else:\n",
|
| 189 |
+
" return \"-\"\n",
|
| 190 |
+
"\n",
|
| 191 |
+
"# 示例DNA序列\n",
|
| 192 |
+
"dna_sequence = \"TCAGTTTTTTGCTTTCGCCGCCGCTTTAACAATGACAGCGAACGCTGACGCTTTTAACGATGCACCACCAACTAATGCACCATCAATATCAGGCTGAGTAAACAATTCTGCTGCATTTGCATCATTGACGGAACCGCCATATTGAATAATTACCCGTTCAGCAACGGCTTGGCTTTGTTTTGCAATATGACCTCGAATAAAGGCATGTACTGCTTGTGCTTGAGCTGGAGTCGCCGATTTACCTGTACCGATAGCCCAAATCGGTTCATAAGCGATTACTGCACCGTTAAATGCTTCAACACCTAGTGTATTCATCACCGCATCAATTTGACGTGCACAAACCTCTTCCGTTTTGCCTGCTTCATTTTCAGCTTCGCTTTCACCGATACATAATACAGGAACTAAACCAGCATCTTTTAACACACCAAATTTTTTCGCAATAAATTCATCACTTTCATGATGATATTGACGTCGCTCAGAATGACCGATAATGACATATTTTACACCAAAGTCTTTTAACATTTCTGTTGAAATATCACCGGTAAATGCACCTTGTTTGTTTAAATCAACATTTTGAGTACCTAAAGCAATATCACTGCTGACCAGTGCAGTTTCAGCTTCCGCTAAATACATGACAGGCGGTGCAATTGCCACATCACAGCCTGACACCGCATTAAGTTCATCTTTTAAACCGGTAATAAGTTCTTTTGTAAAGGCTTTACTACCATTTAATTTCCAGTTACCCATGACTAAAGGACGACGAGCCAT\"\n",
|
| 193 |
+
"\n",
|
| 194 |
+
"# 获取最长的蛋白质序列\n",
|
| 195 |
+
"protein_sequence = find_longest_orf(dna_sequence)\n",
|
| 196 |
+
"print(\"最长的蛋白质序列:\", protein_sequence)\n"
|
| 197 |
+
]
|
| 198 |
+
},
|
| 199 |
+
{
|
| 200 |
+
"cell_type": "code",
|
| 201 |
+
"execution_count": 13,
|
| 202 |
+
"id": "f42457cf-f946-46e5-9d31-b3c8947c0182",
|
| 203 |
+
"metadata": {},
|
| 204 |
+
"outputs": [],
|
| 205 |
+
"source": [
|
| 206 |
+
"#获得ORF完全匹配的\n",
|
| 207 |
+
"not_match_list1 = []\n",
|
| 208 |
+
"for item in not_match_list:\n",
|
| 209 |
+
" example_dna = item[\"seq_a\"]\n",
|
| 210 |
+
" example_protein = item[\"seq_b\"]\n",
|
| 211 |
+
" label = item[\"label\"]\n",
|
| 212 |
+
" \n",
|
| 213 |
+
" protein_trans = find_longest_orf(example_dna)\n",
|
| 214 |
+
"\n",
|
| 215 |
+
" if 1==label:\n",
|
| 216 |
+
" if example_protein.find(protein_trans[1:-2])==-1:#包含即可,前后可以相差几个字母\n",
|
| 217 |
+
" not_match_list1.append(item)\n",
|
| 218 |
+
" else:\n",
|
| 219 |
+
" pos_select_list.append(item)"
|
| 220 |
+
]
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"cell_type": "code",
|
| 224 |
+
"execution_count": 14,
|
| 225 |
+
"id": "4116c8f9-277d-41eb-bd4b-e66e20336ab5",
|
| 226 |
+
"metadata": {},
|
| 227 |
+
"outputs": [],
|
| 228 |
+
"source": [
|
| 229 |
+
"#ORF匹配较多的\n",
|
| 230 |
+
"from Bio import Align\n",
|
| 231 |
+
"\n",
|
| 232 |
+
"# 创建PairwiseAligner对象\n",
|
| 233 |
+
"aligner = Align.PairwiseAligner()\n",
|
| 234 |
+
"\n",
|
| 235 |
+
"# 设置比对模式为全局比对\n",
|
| 236 |
+
"aligner.mode = \"global\"\n",
|
| 237 |
+
"\n",
|
| 238 |
+
"for item in not_match_list1:\n",
|
| 239 |
+
" example_dna = item[\"seq_a\"]\n",
|
| 240 |
+
" example_protein = item[\"seq_b\"]\n",
|
| 241 |
+
" label = item[\"label\"]\n",
|
| 242 |
+
" \n",
|
| 243 |
+
" protein_trans = find_longest_orf(example_dna)\n",
|
| 244 |
+
"\n",
|
| 245 |
+
" \n",
|
| 246 |
+
" if 1==label:\n",
|
| 247 |
+
"\n",
|
| 248 |
+
" alignments = aligner.align(example_protein, protein_trans)\n",
|
| 249 |
+
" score = alignments[0].score\n",
|
| 250 |
+
" protein_trans_len = len(protein_trans)\n",
|
| 251 |
+
"\n",
|
| 252 |
+
" sim_score = score/protein_trans_len\n",
|
| 253 |
+
"\n",
|
| 254 |
+
" \n",
|
| 255 |
+
" if sim_score > 0.8:#匹配较高的\n",
|
| 256 |
+
" pos_select_list.append(item)"
|
| 257 |
+
]
|
| 258 |
+
},
|
| 259 |
+
{
|
| 260 |
+
"cell_type": "code",
|
| 261 |
+
"execution_count": 15,
|
| 262 |
+
"id": "b7a303cc-1bfc-4fe4-9428-addca7b000ea",
|
| 263 |
+
"metadata": {},
|
| 264 |
+
"outputs": [
|
| 265 |
+
{
|
| 266 |
+
"name": "stdout",
|
| 267 |
+
"output_type": "stream",
|
| 268 |
+
"text": [
|
| 269 |
+
"17067 6309\n"
|
| 270 |
+
]
|
| 271 |
+
}
|
| 272 |
+
],
|
| 273 |
+
"source": [
|
| 274 |
+
"print(len(neg_select_list),len(pos_select_list))"
|
| 275 |
+
]
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"cell_type": "code",
|
| 279 |
+
"execution_count": 9,
|
| 280 |
+
"id": "37ee3361-2fd6-441b-bb59-f355a95debae",
|
| 281 |
+
"metadata": {},
|
| 282 |
+
"outputs": [
|
| 283 |
+
{
|
| 284 |
+
"name": "stdout",
|
| 285 |
+
"output_type": "stream",
|
| 286 |
+
"text": [
|
| 287 |
+
"target 0 MA--ES-REPR-GAVE----A----EL-DPVEYTLRK------R-----L------P-H-\n",
|
| 288 |
+
" 0 |---|--|----||------|----|--|-------|------|-----|------|-|-\n",
|
| 289 |
+
"query 0 M-WQE-AR---HGA--WHRRATSGSE-QD-------KKKFIGGRWSHTPLVQKTTVPVHG\n",
|
| 290 |
+
"\n",
|
| 291 |
+
"target 28 ---RLP--R-RPN-DVYVNMKTDFK-AQL------A----RCQKLLDCG-ARG-------\n",
|
| 292 |
+
" 60 ---|-|--|-|---|------|----|--------|----|-----|---|-|-------\n",
|
| 293 |
+
"query 45 HGER-PTSRGR--QD------T---TA--PRTPSSAGSVPR-----D--VA-GGPPRRLA\n",
|
| 294 |
+
"\n",
|
| 295 |
+
"target 62 Q-SACSEIYIHGLG-L----AIN----RA--INIA-LQLQAGS-F---GALQ--VAAN--\n",
|
| 296 |
+
" 120 |-|-------|-|--|----|------|---|--|-|-|-----|---|-----|-----\n",
|
| 297 |
+
"query 83 QKS-------H-L-WLGYWGA--TLKTR-MWI--AEL-L----RFRITG---SRV---SV\n",
|
| 298 |
+
"\n",
|
| 299 |
+
"target 101 ------TS-TVELVDELEPE--TDT-RE-PVI-------RN-----RNNS--AIHIRVF-\n",
|
| 300 |
+
" 180 ------||-|||-|--|-----|---|--|---------|------|-----|---|---\n",
|
| 301 |
+
"query 118 SGSSSSTSSTVE-V--L---AAT--CR-AP--KLPACSCR-AMLMAR---LMA---R--P\n",
|
| 302 |
+
"\n",
|
| 303 |
+
"target 135 RVAPQ-- 140\n",
|
| 304 |
+
" 240 |--|--- 247\n",
|
| 305 |
+
"query 158 R--P-WM 162\n",
|
| 306 |
+
"\n",
|
| 307 |
+
"比对得分: 55.0\n",
|
| 308 |
+
"查询序列: MWQEARHGAWHRRATSGSEQDKKKFIGGRWSHTPLVQKTTVPVHGHGERPTSRGRQDTTAPRTPSSAGSVPRDVAGGPPRRLAQKSHLWLGYWGATLKTRMWIAELLRFRITGSRVSVSGSSSSTSSTVEVLAATCRAPKLPACSCRAMLMARLMARPRPWM\n",
|
| 309 |
+
"目标序列: MAESREPRGAVEAELDPVEYTLRKRLPHRLPRRPNDVYVNMKTDFKAQLARCQKLLDCGARGQSACSEIYIHGLGLAINRAINIALQLQAGSFGALQVAANTSTVELVDELEPETDTREPVIRNRNNSAIHIRVFRVAPQ\n",
|
| 310 |
+
"-----------------------------------------------------------\n",
|
| 311 |
+
"\n"
|
| 312 |
+
]
|
| 313 |
+
}
|
| 314 |
+
],
|
| 315 |
+
"source": [
|
| 316 |
+
"from Bio import Align\n",
|
| 317 |
+
"\n",
|
| 318 |
+
"# 创建PairwiseAligner对象\n",
|
| 319 |
+
"aligner = Align.PairwiseAligner()\n",
|
| 320 |
+
"\n",
|
| 321 |
+
"# 设置比对模式为全局比对\n",
|
| 322 |
+
"aligner.mode = \"global\"\n",
|
| 323 |
+
"\n",
|
| 324 |
+
"\n",
|
| 325 |
+
"# 示例蛋白质序列\n",
|
| 326 |
+
"seq1 = \"MAESREPRGAVEAELDPVEYTLRKRLPHRLPRRPNDVYVNMKTDFKAQLARCQKLLDCGARGQSACSEIYIHGLGLAINRAINIALQLQAGSFGALQVAANTSTVELVDELEPETDTREPVIRNRNNSAIHIRVFRVAPQ\"\n",
|
| 327 |
+
"seq2 = \"MWQEARHGAWHRRATSGSEQDKKKFIGGRWSHTPLVQKTTVPVHGHGERPTSRGRQDTTAPRTPSSAGSVPRDVAGGPPRRLAQKSHLWLGYWGATLKTRMWIAELLRFRITGSRVSVSGSSSSTSSTVEVLAATCRAPKLPACSCRAMLMARLMARPRPWM\"\n",
|
| 328 |
+
"\n",
|
| 329 |
+
"\n",
|
| 330 |
+
"# 执行比对\n",
|
| 331 |
+
"alignments = aligner.align(seq1, seq2)\n",
|
| 332 |
+
"\n",
|
| 333 |
+
"# 输出比对结果\n",
|
| 334 |
+
"for alignment in alignments:\n",
|
| 335 |
+
" print(alignment)\n",
|
| 336 |
+
" print(f\"比对得分: {alignment.score}\")\n",
|
| 337 |
+
" print(f\"查询序列: {alignment.query}\")\n",
|
| 338 |
+
" print(f\"目标序列: {alignment.target}\")\n",
|
| 339 |
+
" print(\"-----------------------------------------------------------\\n\")\n",
|
| 340 |
+
" break"
|
| 341 |
+
]
|
| 342 |
+
},
|
| 343 |
+
{
|
| 344 |
+
"cell_type": "code",
|
| 345 |
+
"execution_count": null,
|
| 346 |
+
"id": "a3499a28-f01d-4e48-9c49-916763cde800",
|
| 347 |
+
"metadata": {},
|
| 348 |
+
"outputs": [],
|
| 349 |
+
"source": []
|
| 350 |
+
}
|
| 351 |
+
],
|
| 352 |
+
"metadata": {
|
| 353 |
+
"kernelspec": {
|
| 354 |
+
"display_name": "Python 3 (ipykernel)",
|
| 355 |
+
"language": "python",
|
| 356 |
+
"name": "python3"
|
| 357 |
+
},
|
| 358 |
+
"language_info": {
|
| 359 |
+
"codemirror_mode": {
|
| 360 |
+
"name": "ipython",
|
| 361 |
+
"version": 3
|
| 362 |
+
},
|
| 363 |
+
"file_extension": ".py",
|
| 364 |
+
"mimetype": "text/x-python",
|
| 365 |
+
"name": "python",
|
| 366 |
+
"nbconvert_exporter": "python",
|
| 367 |
+
"pygments_lexer": "ipython3",
|
| 368 |
+
"version": "3.11.9"
|
| 369 |
+
}
|
| 370 |
+
},
|
| 371 |
+
"nbformat": 4,
|
| 372 |
+
"nbformat_minor": 5
|
| 373 |
+
}
|
get_data/get_dna_protein_pair_rand.ipynb
ADDED
|
@@ -0,0 +1,412 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "cabe185c-850a-45be-a1fe-a0913bf921a3",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"#获得dna-蛋白质数据"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": 1,
|
| 16 |
+
"id": "9ff80573-0411-4244-8fdc-488f1592e5cf",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [
|
| 19 |
+
{
|
| 20 |
+
"name": "stdout",
|
| 21 |
+
"output_type": "stream",
|
| 22 |
+
"text": [
|
| 23 |
+
"--2025-02-15 18:51:21-- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz\n",
|
| 24 |
+
" => ‘uniprot_sprot.fasta.gz’\n",
|
| 25 |
+
"Resolving ftp.uniprot.org (ftp.uniprot.org)... 128.175.240.195\n",
|
| 26 |
+
"Connecting to ftp.uniprot.org (ftp.uniprot.org)|128.175.240.195|:21... connected.\n",
|
| 27 |
+
"Logging in as anonymous ... Logged in!\n",
|
| 28 |
+
"==> SYST ... done. ==> PWD ... done.\n",
|
| 29 |
+
"==> TYPE I ... done. ==> CWD (1) /pub/databases/uniprot/current_release/knowledgebase/complete ... done.\n",
|
| 30 |
+
"==> SIZE uniprot_sprot.fasta.gz ... 92924866\n",
|
| 31 |
+
"==> PASV ... done. ==> RETR uniprot_sprot.fasta.gz ... done.\n",
|
| 32 |
+
"Length: 92924866 (89M) (unauthoritative)\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"uniprot_sprot.fasta 100%[===================>] 88.62M 284KB/s in 3m 38s \n",
|
| 35 |
+
"\n",
|
| 36 |
+
"2025-02-15 18:55:02 (417 KB/s) - ‘uniprot_sprot.fasta.gz’ saved [92924866]\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"tar: This does not look like a tar archive\n",
|
| 39 |
+
"tar: Skipping to next header\n",
|
| 40 |
+
"tar: Exiting with failure status due to previous errors\n"
|
| 41 |
+
]
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"source": [
|
| 45 |
+
"#获得蛋白质fasta数据\n",
|
| 46 |
+
"!wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"cell_type": "code",
|
| 51 |
+
"execution_count": 2,
|
| 52 |
+
"id": "476f187e-7c70-4c19-bb81-9df4b4360529",
|
| 53 |
+
"metadata": {},
|
| 54 |
+
"outputs": [],
|
| 55 |
+
"source": [
|
| 56 |
+
"!gunzip uniprot_sprot.fasta.gz"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "code",
|
| 61 |
+
"execution_count": 3,
|
| 62 |
+
"id": "2c4bf4a6-8f82-4b12-aa66-f5dd89929cf2",
|
| 63 |
+
"metadata": {},
|
| 64 |
+
"outputs": [],
|
| 65 |
+
"source": [
|
| 66 |
+
"!grep \">sp\" uniprot_sprot.fasta|awk -F \"|\" '{print $2}' > uniprot_sprot.fasta.id"
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"cell_type": "code",
|
| 71 |
+
"execution_count": 5,
|
| 72 |
+
"id": "36ff4e11-0d8e-46a5-956e-26cac817783b",
|
| 73 |
+
"metadata": {},
|
| 74 |
+
"outputs": [
|
| 75 |
+
{
|
| 76 |
+
"name": "stderr",
|
| 77 |
+
"output_type": "stream",
|
| 78 |
+
"text": [
|
| 79 |
+
"/home/maris/miniconda3/envs/dnagpt/lib/python3.11/site-packages/Bio/pairwise2.py:278: BiopythonDeprecationWarning: Bio.pairwise2 has been deprecated, and we intend to remove it in a future release of Biopython. As an alternative, please consider using Bio.Align.PairwiseAligner as a replacement, and contact the Biopython developers if you still need the Bio.pairwise2 module.\n",
|
| 80 |
+
" warnings.warn(\n"
|
| 81 |
+
]
|
| 82 |
+
}
|
| 83 |
+
],
|
| 84 |
+
"source": [
|
| 85 |
+
"from Bio import Entrez, SeqIO\n",
|
| 86 |
+
"from Bio.Seq import Seq\n",
|
| 87 |
+
"import requests\n",
|
| 88 |
+
"from io import StringIO\n",
|
| 89 |
+
"import re\n",
|
| 90 |
+
"from Bio import pairwise2\n",
|
| 91 |
+
"from Bio.pairwise2 import format_alignment\n",
|
| 92 |
+
"\n",
|
| 93 |
+
"Entrez.email = \"wangliang.f@gmail.com\" #ncbi自己注册一个邮箱。https://www.ncbi.nlm.nih.gov/account/login/"
|
| 94 |
+
]
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"cell_type": "code",
|
| 98 |
+
"execution_count": 6,
|
| 99 |
+
"id": "2513d8b9-edfb-4e34-8615-57d291f53557",
|
| 100 |
+
"metadata": {},
|
| 101 |
+
"outputs": [
|
| 102 |
+
{
|
| 103 |
+
"data": {
|
| 104 |
+
"text/plain": [
|
| 105 |
+
"'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL'"
|
| 106 |
+
]
|
| 107 |
+
},
|
| 108 |
+
"execution_count": 6,
|
| 109 |
+
"metadata": {},
|
| 110 |
+
"output_type": "execute_result"
|
| 111 |
+
}
|
| 112 |
+
],
|
| 113 |
+
"source": [
|
| 114 |
+
"#step 1,获得完整的fasta数据\n",
|
| 115 |
+
"def fetch_uniprot_protein_sequence(uniprot_id):\n",
|
| 116 |
+
" url = f\"https://www.uniprot.org/uniprot/{uniprot_id}.fasta\"\n",
|
| 117 |
+
" response = requests.get(url)\n",
|
| 118 |
+
" if response.status_code == 200:\n",
|
| 119 |
+
" fasta_data = response.text\n",
|
| 120 |
+
" record = SeqIO.read(StringIO(fasta_data), \"fasta\")\n",
|
| 121 |
+
" return str(record.seq)\n",
|
| 122 |
+
" else:\n",
|
| 123 |
+
" raise ValueError(f\"未能从 UniProt 获取蛋白质序列,状态码:{response.status_code}\")\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"uniprot_id = \"Q6GZX4\" #第一条数据为例\n",
|
| 126 |
+
"protein_sequence = fetch_uniprot_protein_sequence(uniprot_id)\n",
|
| 127 |
+
"protein_sequence"
|
| 128 |
+
]
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"cell_type": "code",
|
| 132 |
+
"execution_count": 7,
|
| 133 |
+
"id": "01805f1f-213e-4212-8873-2c0a83206840",
|
| 134 |
+
"metadata": {},
|
| 135 |
+
"outputs": [
|
| 136 |
+
{
|
| 137 |
+
"data": {
|
| 138 |
+
"text/plain": [
|
| 139 |
+
"'81941549'"
|
| 140 |
+
]
|
| 141 |
+
},
|
| 142 |
+
"execution_count": 7,
|
| 143 |
+
"metadata": {},
|
| 144 |
+
"output_type": "execute_result"
|
| 145 |
+
}
|
| 146 |
+
],
|
| 147 |
+
"source": [
|
| 148 |
+
"#step 2, 获得ncbi的蛋白质id,注意这个蛋白质id和uniprot的不一样\n",
|
| 149 |
+
"handle = Entrez.esearch(db=\"protein\", term=uniprot_id)\n",
|
| 150 |
+
"record = Entrez.read(handle)\n",
|
| 151 |
+
"handle.close()\n",
|
| 152 |
+
"\n",
|
| 153 |
+
"protein_ncbi_id = record[\"IdList\"][0]\n",
|
| 154 |
+
"protein_ncbi_id"
|
| 155 |
+
]
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"cell_type": "code",
|
| 159 |
+
"execution_count": 21,
|
| 160 |
+
"id": "746deb4d-40ec-4235-9ec4-c45c84889da9",
|
| 161 |
+
"metadata": {},
|
| 162 |
+
"outputs": [
|
| 163 |
+
{
|
| 164 |
+
"name": "stdout",
|
| 165 |
+
"output_type": "stream",
|
| 166 |
+
"text": [
|
| 167 |
+
"AY548484.1\n"
|
| 168 |
+
]
|
| 169 |
+
}
|
| 170 |
+
],
|
| 171 |
+
"source": [
|
| 172 |
+
"#step 3,获得ncbi中的数据使用ncbi id\n",
|
| 173 |
+
"def extract_first_xref_id(info_string):\n",
|
| 174 |
+
" \"\"\"\n",
|
| 175 |
+
" 从给定的字符串中提取 xrefs: 后面的第一个 ID。\n",
|
| 176 |
+
" \n",
|
| 177 |
+
" 参数:\n",
|
| 178 |
+
" - info_string (str): 包含 UniProtKB 信息的字符串\n",
|
| 179 |
+
" \n",
|
| 180 |
+
" 返回:\n",
|
| 181 |
+
" - str 或 None: 如果找到,则返回第一个 ID;否则返回 None。\n",
|
| 182 |
+
" \"\"\"\n",
|
| 183 |
+
" # 使用正则表达式查找 'xrefs:' 后面的第一个 ID\n",
|
| 184 |
+
" match = re.search(r'xrefs:\\s*([\\w.-]+)', info_string)\n",
|
| 185 |
+
" if match:\n",
|
| 186 |
+
" return match.group(1) # 返回匹配到的第一个 ID\n",
|
| 187 |
+
" else:\n",
|
| 188 |
+
" return None\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"\n",
|
| 191 |
+
"def fetch_ncbi_genbank_data(protein_ncbi_id):\n",
|
| 192 |
+
" #STEP 1, 获得protein 数据\n",
|
| 193 |
+
" handle = Entrez.efetch(db=\"protein\", id=protein_ncbi_id, rettype=\"gb\", retmode=\"text\")\n",
|
| 194 |
+
" genbank_data = handle.read()\n",
|
| 195 |
+
" handle.close()\n",
|
| 196 |
+
"\n",
|
| 197 |
+
" #获得dna的id\n",
|
| 198 |
+
" rec = SeqIO.parse(StringIO(genbank_data), \"genbank\")\n",
|
| 199 |
+
" for item in rec:\n",
|
| 200 |
+
" record = item\n",
|
| 201 |
+
" break\n",
|
| 202 |
+
"\n",
|
| 203 |
+
" db_source = record.annotations[\"db_source\"]\n",
|
| 204 |
+
" xref_id = extract_first_xref_id(db_source)\n",
|
| 205 |
+
"\n",
|
| 206 |
+
" print(xref_id)\n",
|
| 207 |
+
" \n",
|
| 208 |
+
"\n",
|
| 209 |
+
" #step2,获得dna数据\n",
|
| 210 |
+
" r_handle = Entrez.efetch(id=xref_id, db='nucleotide', rettype='gb', retmode='text')\n",
|
| 211 |
+
" dna_data = r_handle.read()\n",
|
| 212 |
+
" r_handle.close()\n",
|
| 213 |
+
" \n",
|
| 214 |
+
" return dna_data\n",
|
| 215 |
+
"\n",
|
| 216 |
+
"genbank_data = fetch_ncbi_genbank_data(protein_ncbi_id)\n",
|
| 217 |
+
"#print(genbank_data)"
|
| 218 |
+
]
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"cell_type": "code",
|
| 222 |
+
"execution_count": 15,
|
| 223 |
+
"id": "90be7571-db8d-4bc8-b6ee-0b734a4d112e",
|
| 224 |
+
"metadata": {},
|
| 225 |
+
"outputs": [
|
| 226 |
+
{
|
| 227 |
+
"name": "stdout",
|
| 228 |
+
"output_type": "stream",
|
| 229 |
+
"text": [
|
| 230 |
+
"LOCUS 001R_FRG3G 256 aa linear VRL 08-NOV-2023\n",
|
| 231 |
+
"DEFINITION RecName: Full=Putative transcription factor 001R.\n",
|
| 232 |
+
"ACCESSION Q6GZX4\n",
|
| 233 |
+
"VERSION Q6GZX4.1\n",
|
| 234 |
+
"DBSOURCE UniProtKB: locus 001R_FRG3G, accession Q6GZX4;\n",
|
| 235 |
+
" class: standard.\n",
|
| 236 |
+
" created: Jun 28, 2011.\n",
|
| 237 |
+
" sequence updated: Jul 19, 2004.\n",
|
| 238 |
+
" annotation updated: Nov 8, 2023.\n",
|
| 239 |
+
" xrefs: AY548484.1, AAT09660.1, YP_031579.1\n",
|
| 240 |
+
" xrefs (non-sequence databases): SwissPalm:Q6GZX4, GeneID:2947773,\n",
|
| 241 |
+
" KEGG:vg:2947773, Proteomes:UP000008770, GO:0046782,\n",
|
| 242 |
+
" InterPro:IPR007031, Pfam:PF04947\n",
|
| 243 |
+
"KEYWORDS Activator; Reference proteome; Transcription; Transcription\n",
|
| 244 |
+
" regulation.\n",
|
| 245 |
+
"SOURCE Frog virus 3 (isolate Goorha)\n",
|
| 246 |
+
" ORGANISM Frog virus 3 (isolate Goorha)\n",
|
| 247 |
+
" Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota;\n",
|
| 248 |
+
" Megaviricetes; Pimascovirales; Iridoviridae; Alphairidovirinae;\n",
|
| 249 |
+
" Ranavirus; Frog virus 3.\n",
|
| 250 |
+
"REFERENCE 1 (residues 1 to 256)\n",
|
| 251 |
+
" AUTHORS Tan,W.G., Barkman,T.J., Gregory Chinchar,V. and Essani,K.\n",
|
| 252 |
+
" TITLE Comparative genomic analyses of frog virus 3, type species of the\n",
|
| 253 |
+
" genus Ranavirus (family Iridoviridae)\n",
|
| 254 |
+
" JOURNAL Virology 323 (1), 70-84 (2004)\n",
|
| 255 |
+
" PUBMED 15165820\n",
|
| 256 |
+
" REMARK NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].\n",
|
| 257 |
+
"COMMENT [FUNCTION] Transcription activation. {ECO:0000305}.\n",
|
| 258 |
+
"FEATURES Location/Qualifiers\n",
|
| 259 |
+
" source 1..256\n",
|
| 260 |
+
" /organism=\"Frog virus 3 (isolate Goorha)\"\n",
|
| 261 |
+
" /host=\"Dryophytes versicolor (chameleon treefrog)\"\n",
|
| 262 |
+
" /host=\"Lithobates pipiens (Northern leopard frog) (Rana\n",
|
| 263 |
+
" pipiens)\"\n",
|
| 264 |
+
" /host=\"Lithobates sylvaticus (Wood frog) (Rana sylvatica)\"\n",
|
| 265 |
+
" /host=\"Notophthalmus viridescens (Eastern newt) (Triturus\n",
|
| 266 |
+
" viridescens)\"\n",
|
| 267 |
+
" /db_xref=\"taxon:654924\"\n",
|
| 268 |
+
" gene 1..256\n",
|
| 269 |
+
" /locus_tag=\"FV3-001R\"\n",
|
| 270 |
+
" Protein 1..256\n",
|
| 271 |
+
" /product=\"Putative transcription factor 001R\"\n",
|
| 272 |
+
" /UniProtKB_evidence=\"Predicted\"\n",
|
| 273 |
+
" Region 1..256\n",
|
| 274 |
+
" /region_name=\"Mature chain\"\n",
|
| 275 |
+
" /note=\"Putative transcription factor 001R.\n",
|
| 276 |
+
" /id=PRO_0000410512.\"\n",
|
| 277 |
+
" Region 81..253\n",
|
| 278 |
+
" /region_name=\"Pox_VLTF3\"\n",
|
| 279 |
+
" /note=\"Poxvirus Late Transcription Factor VLTF3 like;\n",
|
| 280 |
+
" pfam04947\"\n",
|
| 281 |
+
" /db_xref=\"CDD:282761\"\n",
|
| 282 |
+
"ORIGIN \n",
|
| 283 |
+
" 1 mafsaedvlk eydrrrrmea lllslyypnd rklldykews pprvqvecpk apvewnnpps\n",
|
| 284 |
+
" 61 ekglivghfs gikykgekaq asevdvnkmc cwvskfkdam rryqgiqtck ipgkvlsdld\n",
|
| 285 |
+
" 121 akikaynltv egvegfvrys rvtkqhvaaf lkelrhskqy envnlihyil tdkrvdiqhl\n",
|
| 286 |
+
" 181 ekdlvkdfka lvesahrmrq ghminvkyil yqllkkhghg pdgpdiltvk tgskgvlydd\n",
|
| 287 |
+
" 241 sfrkiytdlg wkftpl\n",
|
| 288 |
+
"//\n",
|
| 289 |
+
"\n",
|
| 290 |
+
"\n"
|
| 291 |
+
]
|
| 292 |
+
}
|
| 293 |
+
],
|
| 294 |
+
"source": [
|
| 295 |
+
"#分步测试,STEP 1, 获得protein 数据 这个里面应该有对应的dna数据CDS的,但其实没有。。\n",
|
| 296 |
+
"handle = Entrez.efetch(db=\"protein\", id=\"81941549\", rettype=\"gb\", retmode=\"text\")\n",
|
| 297 |
+
"genbank_data_protein = handle.read()\n",
|
| 298 |
+
"handle.close()\n",
|
| 299 |
+
"print(genbank_data_protein) #需要其中的db_source中xrefs里面的数据中的第1个,也就是AY548484.1"
|
| 300 |
+
]
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"cell_type": "code",
|
| 304 |
+
"execution_count": 17,
|
| 305 |
+
"id": "1b3c03e0-ea22-4ec4-aa4d-4994485114f1",
|
| 306 |
+
"metadata": {},
|
| 307 |
+
"outputs": [],
|
| 308 |
+
"source": [
|
| 309 |
+
"def calculate_similarity(seq1, seq2):\n",
|
| 310 |
+
" \"\"\"使用局部比对计算两个序列之间的相似度得分\"\"\"\n",
|
| 311 |
+
" alignments = pairwise2.align.localxx(seq1, seq2)\n",
|
| 312 |
+
" best_score = max(aln.score for aln in alignments) if alignments else 0\n",
|
| 313 |
+
" return best_score\n",
|
| 314 |
+
"\n",
|
| 315 |
+
"def extract_cds_and_translate(genbank_data, protein_seq_ori=None):\n",
|
| 316 |
+
" results = []\n",
|
| 317 |
+
" record = None\n",
|
| 318 |
+
"\n",
|
| 319 |
+
" # 使用 StringIO 加载 GenBank 数据\n",
|
| 320 |
+
" for rec in SeqIO.parse(StringIO(genbank_data), \"genbank\"):\n",
|
| 321 |
+
" record = rec\n",
|
| 322 |
+
" break\n",
|
| 323 |
+
"\n",
|
| 324 |
+
" \n",
|
| 325 |
+
" if not record:\n",
|
| 326 |
+
" raise ValueError(\"未能成功解析 GenBank 数据\")\n",
|
| 327 |
+
"\n",
|
| 328 |
+
" gene_id = record.id\n",
|
| 329 |
+
"\n",
|
| 330 |
+
" for feature in record.features:\n",
|
| 331 |
+
" if feature.type == \"CDS\":\n",
|
| 332 |
+
" cds_start = feature.location.start\n",
|
| 333 |
+
" cds_end = feature.location.end\n",
|
| 334 |
+
" cds_sequence = record.seq[cds_start:cds_end]\n",
|
| 335 |
+
" protein_sequence = feature.qualifiers[\"translation\"][0]\n",
|
| 336 |
+
" protein_id = feature.qualifiers[\"protein_id\"][0]\n",
|
| 337 |
+
" \n",
|
| 338 |
+
" sim_score = calculate_similarity(protein_sequence, protein_seq_ori)\n",
|
| 339 |
+
" results.append({\n",
|
| 340 |
+
" \"protein_id\":protein_id,\n",
|
| 341 |
+
" \"gene_id\": gene_id ,\n",
|
| 342 |
+
" \"cds_start\": cds_start,\n",
|
| 343 |
+
" \"cds_end\": cds_end,\n",
|
| 344 |
+
" \"dna_sequence\": str(cds_sequence),\n",
|
| 345 |
+
" \"protein_sequence\": str(protein_sequence),\n",
|
| 346 |
+
" \"sim\":sim_score\n",
|
| 347 |
+
" })\n",
|
| 348 |
+
" # 使用 sorted() 函数并指定 key 和 reverse 参数\n",
|
| 349 |
+
" sorted_results = sorted(results, key=lambda x: x['sim'], reverse=True)\n",
|
| 350 |
+
" return sorted_results\n",
|
| 351 |
+
"\n",
|
| 352 |
+
"cds_data_list = extract_cds_and_translate(genbank_data, protein_sequence)"
|
| 353 |
+
]
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"cell_type": "code",
|
| 357 |
+
"execution_count": 19,
|
| 358 |
+
"id": "fdd02166-9af8-433e-a6c6-051482759623",
|
| 359 |
+
"metadata": {},
|
| 360 |
+
"outputs": [
|
| 361 |
+
{
|
| 362 |
+
"data": {
|
| 363 |
+
"text/plain": [
|
| 364 |
+
"{'protein_id': 'AAT09660.1',\n",
|
| 365 |
+
" 'gene_id': 'AY548484.1',\n",
|
| 366 |
+
" 'cds_start': ExactPosition(271),\n",
|
| 367 |
+
" 'cds_end': ExactPosition(1042),\n",
|
| 368 |
+
" 'dna_sequence': 'ATGGCATTCTCGGCAGAAGATGTGCTGAAGGAGTACGACAGGAGACGGAGGATGGAGGCCCTCTTGCTCAGCCTGTACTACCCAAACGACCGCAAGCTCCTAGACTACAAAGAGTGGTCTCCGCCCAGGGTTCAGGTAGAGTGTCCCAAAGCCCCCGTGGAGTGGAACAACCCTCCGTCAGAAAAGGGTCTCATCGTGGGGCACTTTAGCGGCATAAAGTACAAGGGGGAAAAGGCTCAGGCATCCGAGGTAGACGTCAACAAGATGTGCTGCTGGGTGTCCAAGTTTAAAGACGCCATGAGGAGGTACCAGGGCATACAGACTTGCAAGATCCCCGGCAAGGTCCTGTCGGACCTCGACGCCAAAATAAAGGCTTACAACCTCACCGTTGAGGGCGTAGAGGGTTTCGTGAGGTACTCACGAGTGACCAAGCAGCACGTAGCAGCTTTCCTCAAGGAGCTCAGGCACTCTAAGCAGTACGAAAACGTCAACCTCATCCACTACATCCTCACCGACAAGAGGGTAGACATTCAGCACCTGGAAAAGGATCTTGTCAAGGATTTTAAGGCGCTGGTGGAATCTGCTCACAGGATGAGGCAGGGCCACATGATCAACGTAAAGTACATACTCTACCAGCTCCTCAAGAAGCACGGTCACGGGCCAGACGGTCCAGACATCCTGACCGTAAAGACTGGAAGCAAGGGAGTCTTGTACGACGATTCCTTTCGCAAGATTTACACGGACCTCGGGTGGAAGTTTACCCCCCTATGA',\n",
|
| 369 |
+
" 'protein_sequence': 'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL',\n",
|
| 370 |
+
" 'sim': 256.0}"
|
| 371 |
+
]
|
| 372 |
+
},
|
| 373 |
+
"execution_count": 19,
|
| 374 |
+
"metadata": {},
|
| 375 |
+
"output_type": "execute_result"
|
| 376 |
+
}
|
| 377 |
+
],
|
| 378 |
+
"source": [
|
| 379 |
+
"cds_data_list[0]"
|
| 380 |
+
]
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"cell_type": "code",
|
| 384 |
+
"execution_count": null,
|
| 385 |
+
"id": "58031699-a779-44f8-b88e-2efb6b53d757",
|
| 386 |
+
"metadata": {},
|
| 387 |
+
"outputs": [],
|
| 388 |
+
"source": []
|
| 389 |
+
}
|
| 390 |
+
],
|
| 391 |
+
"metadata": {
|
| 392 |
+
"kernelspec": {
|
| 393 |
+
"display_name": "Python 3 (ipykernel)",
|
| 394 |
+
"language": "python",
|
| 395 |
+
"name": "python3"
|
| 396 |
+
},
|
| 397 |
+
"language_info": {
|
| 398 |
+
"codemirror_mode": {
|
| 399 |
+
"name": "ipython",
|
| 400 |
+
"version": 3
|
| 401 |
+
},
|
| 402 |
+
"file_extension": ".py",
|
| 403 |
+
"mimetype": "text/x-python",
|
| 404 |
+
"name": "python",
|
| 405 |
+
"nbconvert_exporter": "python",
|
| 406 |
+
"pygments_lexer": "ipython3",
|
| 407 |
+
"version": "3.11.9"
|
| 408 |
+
}
|
| 409 |
+
},
|
| 410 |
+
"nbformat": 4,
|
| 411 |
+
"nbformat_minor": 5
|
| 412 |
+
}
|
get_data/get_lucaone_data.ipynb
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "36cb4bfb-c6e0-4924-a91d-57dfccd63801",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"#获得lucaone的中心法则相关数据"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": 2,
|
| 16 |
+
"id": "01c3e3cc-7f70-4f9f-89f4-ed236845e64e",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [
|
| 19 |
+
{
|
| 20 |
+
"name": "stdout",
|
| 21 |
+
"output_type": "stream",
|
| 22 |
+
"text": [
|
| 23 |
+
"--2025-02-15 11:56:31-- http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/train/train.csv\n",
|
| 24 |
+
"Connecting to 47.93.21.181:80... connected.\n",
|
| 25 |
+
"HTTP request sent, awaiting response... 200 OK\n",
|
| 26 |
+
"Length: 5302028 (5.1M) [application/octet-stream]\n",
|
| 27 |
+
"Saving to: ‘train.csv’\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"train.csv 100%[===================>] 5.06M 12.7MB/s in 0.4s \n",
|
| 30 |
+
"\n",
|
| 31 |
+
"2025-02-15 11:56:31 (12.7 MB/s) - ‘train.csv’ saved [5302028/5302028]\n",
|
| 32 |
+
"\n",
|
| 33 |
+
"--2025-02-15 11:56:31-- http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/test/test.csv\n",
|
| 34 |
+
"Connecting to 47.93.21.181:80... connected.\n",
|
| 35 |
+
"HTTP request sent, awaiting response... 200 OK\n",
|
| 36 |
+
"Length: 33131633 (32M) [application/octet-stream]\n",
|
| 37 |
+
"Saving to: ‘test.csv’\n",
|
| 38 |
+
"\n",
|
| 39 |
+
"test.csv 100%[===================>] 31.60M 9.59MB/s in 3.3s \n",
|
| 40 |
+
"\n",
|
| 41 |
+
"2025-02-15 11:56:35 (9.59 MB/s) - ‘test.csv’ saved [33131633/33131633]\n",
|
| 42 |
+
"\n",
|
| 43 |
+
"--2025-02-15 11:56:35-- http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/dev/dev.csv\n",
|
| 44 |
+
"Connecting to 47.93.21.181:80... connected.\n",
|
| 45 |
+
"HTTP request sent, awaiting response... 200 OK\n",
|
| 46 |
+
"Length: 3982657 (3.8M) [application/octet-stream]\n",
|
| 47 |
+
"Saving to: ‘dev.csv’\n",
|
| 48 |
+
"\n",
|
| 49 |
+
"dev.csv 100%[===================>] 3.80M 8.46MB/s in 0.4s \n",
|
| 50 |
+
"\n",
|
| 51 |
+
"2025-02-15 11:56:35 (8.46 MB/s) - ‘dev.csv’ saved [3982657/3982657]\n",
|
| 52 |
+
"\n"
|
| 53 |
+
]
|
| 54 |
+
}
|
| 55 |
+
],
|
| 56 |
+
"source": [
|
| 57 |
+
"!wget http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/train/train.csv\n",
|
| 58 |
+
"!wget http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/test/test.csv\n",
|
| 59 |
+
"!wget http://47.93.21.181/lucaone/DownstreamTasksDataset/dataset/CentralDogma/gene_protein/binary_class/dev/dev.csv"
|
| 60 |
+
]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "code",
|
| 64 |
+
"execution_count": 3,
|
| 65 |
+
"id": "23d39250-50a6-4557-9cb9-2d637ce52835",
|
| 66 |
+
"metadata": {},
|
| 67 |
+
"outputs": [],
|
| 68 |
+
"source": [
|
| 69 |
+
"#合并文件\n",
|
| 70 |
+
"# 先复制第一个文件的内容(包括表头)\n",
|
| 71 |
+
"!cp train.csv central_dogma.csv\n",
|
| 72 |
+
"\n",
|
| 73 |
+
"# 然后将后续文件的内容(跳过表头)追加到combined.csv中\n",
|
| 74 |
+
"!tail -n +2 test.csv >> central_dogma.csv\n",
|
| 75 |
+
"!tail -n +2 dev.csv >> central_dogma.csv"
|
| 76 |
+
]
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"cell_type": "code",
|
| 80 |
+
"execution_count": null,
|
| 81 |
+
"id": "d54081eb-d30c-4b3f-8a6d-0d1b53c37fd7",
|
| 82 |
+
"metadata": {},
|
| 83 |
+
"outputs": [],
|
| 84 |
+
"source": []
|
| 85 |
+
}
|
| 86 |
+
],
|
| 87 |
+
"metadata": {
|
| 88 |
+
"kernelspec": {
|
| 89 |
+
"display_name": "Python 3 (ipykernel)",
|
| 90 |
+
"language": "python",
|
| 91 |
+
"name": "python3"
|
| 92 |
+
},
|
| 93 |
+
"language_info": {
|
| 94 |
+
"codemirror_mode": {
|
| 95 |
+
"name": "ipython",
|
| 96 |
+
"version": 3
|
| 97 |
+
},
|
| 98 |
+
"file_extension": ".py",
|
| 99 |
+
"mimetype": "text/x-python",
|
| 100 |
+
"name": "python",
|
| 101 |
+
"nbconvert_exporter": "python",
|
| 102 |
+
"pygments_lexer": "ipython3",
|
| 103 |
+
"version": "3.11.9"
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
"nbformat": 4,
|
| 107 |
+
"nbformat_minor": 5
|
| 108 |
+
}
|
get_data/get_protein_dna_pair.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Bio import Entrez, SeqIO
|
| 2 |
+
from Bio.Seq import Seq
|
| 3 |
+
import requests
|
| 4 |
+
from io import StringIO
|
| 5 |
+
import re
|
| 6 |
+
from Bio import pairwise2
|
| 7 |
+
from Bio.pairwise2 import format_alignment
|
| 8 |
+
|
| 9 |
+
Entrez.email = "wangliang.f@gmail.com"
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def calculate_similarity(seq1, seq2):
    """Return the best local-alignment score between two sequences.

    Uses Biopython's ``pairwise2`` identity-scoring local alignment
    (match = 1, no mismatch/gap penalties).  When no alignment can be
    produced at all, the score is 0.
    """
    alignments = pairwise2.align.localxx(seq1, seq2)
    if not alignments:
        return 0
    return max(alignment.score for alignment in alignments)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def extract_first_xref_id(info_string):
    """Extract the first accession listed after ``xrefs:`` in a db_source string.

    Parameters
    ----------
    info_string : str
        The UniProtKB ``db_source`` annotation text, e.g.
        ``"... xrefs: AY548484.1, AAT09660.1, ..."``.

    Returns
    -------
    str or None
        The first xref accession (letters, digits, ``.``, ``-``, ``_``),
        or ``None`` when no ``xrefs:`` section is present.
    """
    found = re.search(r'xrefs:\s*([\w.-]+)', info_string)
    return found.group(1) if found else None
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def fetch_uniprot_protein_sequence(uniprot_id):
    """Download the protein sequence for *uniprot_id* from UniProt as a string.

    Fetches the FASTA file over HTTP, parses it with Biopython, and returns
    the bare amino-acid sequence.  Raises ValueError on any non-200 response.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    response = requests.get(url)
    if response.status_code != 200:
        raise ValueError(f"未能从 UniProt 获取蛋白质序列,状态码:{response.status_code}")
    record = SeqIO.read(StringIO(response.text), "fasta")
    return str(record.seq)
|
| 48 |
+
|
| 49 |
+
def fetch_ncbi_genbank_data(protein_ncbi_id):
    """Fetch the nucleotide GenBank record cross-referenced by an NCBI protein entry.

    Workflow:
      1. Download the protein GenBank record for *protein_ncbi_id*.
      2. Read the first cross-reference accession (typically the source
         DNA entry, e.g. ``AY548484.1``) from the ``db_source`` annotation.
      3. Download and return the matching nucleotide GenBank text.

    Parameters
    ----------
    protein_ncbi_id : str
        NCBI protein database identifier (GI or accession).

    Returns
    -------
    str
        Raw GenBank-format text of the nucleotide record.

    Raises
    ------
    ValueError
        If the protein record cannot be parsed, or its ``db_source``
        carries no usable xref accession.
    """
    # Step 1: protein GenBank record.
    handle = Entrez.efetch(db="protein", id=protein_ncbi_id, rettype="gb", retmode="text")
    genbank_data = handle.read()
    handle.close()

    # Parse only the first record.  The original loop left `record`
    # undefined (NameError) when the parse yielded nothing; fail clearly.
    record = next(SeqIO.parse(StringIO(genbank_data), "genbank"), None)
    if record is None:
        raise ValueError(f"未能成功解析蛋白质 GenBank 数据: {protein_ncbi_id}")

    db_source = record.annotations["db_source"]
    xref_id = extract_first_xref_id(db_source)
    print("xref_id", xref_id)
    if xref_id is None:
        # Without an xref we would silently query efetch with id=None.
        raise ValueError(f"db_source 中未找到 xref ID: {protein_ncbi_id}")

    # Step 2: nucleotide GenBank record referenced by the protein entry.
    r_handle = Entrez.efetch(id=xref_id, db='nucleotide', rettype='gb', retmode='text')
    dna_data = r_handle.read()
    r_handle.close()

    return dna_data
|
| 74 |
+
|
| 75 |
+
def extract_cds_and_translate(genbank_data, protein_seq_ori=None):
    """Collect every CDS from a GenBank text and rank them by protein similarity.

    Parameters
    ----------
    genbank_data : str
        Raw GenBank-format text containing one nucleotide record
        (only the first record is used).
    protein_seq_ori : str, optional
        Reference protein sequence.  Each CDS's annotated translation is
        locally aligned against it; results are sorted by score, best first.
        When omitted (None) every similarity score is 0 (original order
        is kept by the stable sort).

    Returns
    -------
    list[dict]
        One dict per CDS with keys ``gene_id``, ``protein_id``,
        ``cds_start``, ``cds_end``, ``dna_sequence``,
        ``protein_sequence`` and ``sim``.

    Raises
    ------
    ValueError
        If no record can be parsed from *genbank_data*.
    """
    record = next(SeqIO.parse(StringIO(genbank_data), "genbank"), None)
    if record is None:
        raise ValueError("未能成功解析 GenBank 数据")

    gene_id = record.id
    results = []

    for feature in record.features:
        if feature.type != "CDS":
            continue
        qualifiers = feature.qualifiers
        # Some CDS features (e.g. pseudogenes) carry no translation;
        # the original code raised KeyError on them — skip instead.
        if "translation" not in qualifiers:
            continue

        cds_start = feature.location.start
        cds_end = feature.location.end
        cds_sequence = record.seq[cds_start:cds_end]
        protein_sequence = qualifiers["translation"][0]
        # protein_id may be absent on partial annotations.
        protein_id = qualifiers.get("protein_id", [""])[0]

        # Guard the declared-but-previously-unusable default
        # protein_seq_ori=None: aligning against None would crash.
        sim_score = (calculate_similarity(protein_sequence, protein_seq_ori)
                     if protein_seq_ori else 0)

        results.append({
            "gene_id": gene_id,
            "protein_id": protein_id,
            "cds_start": cds_start,
            "cds_end": cds_end,
            "dna_sequence": str(cds_sequence),
            "protein_sequence": str(protein_sequence),
            "sim": sim_score,
        })

    # Best-matching CDS first.
    return sorted(results, key=lambda item: item["sim"], reverse=True)
|
| 118 |
+
|
| 119 |
+
def get_protein_and_dna_sequences(uniprot_id):
    """Resolve a UniProt accession to its protein sequence and matching CDS list.

    Pipeline: UniProt FASTA download -> NCBI protein search -> linked
    nucleotide GenBank record -> CDS extraction ranked by similarity to
    the UniProt protein.

    Returns
    -------
    dict
        ``{"uniprot_id", "protein_sequence", "cds_data"}`` on success,
        or ``{"error": <message>}`` on any failure (best-effort wrapper:
        all exceptions are caught and reported, never propagated).
    """
    try:
        print(f"正在获取 UniProt ID: {uniprot_id} 的蛋白质序列...")
        protein_sequence = fetch_uniprot_protein_sequence(uniprot_id)

        print("正在获取 NCBI 数据...")
        handle = Entrez.esearch(db="protein", term=uniprot_id)
        search_result = Entrez.read(handle)
        handle.close()

        id_list = search_result["IdList"]
        if not id_list:
            raise ValueError("未找到对应的蛋白质记录")

        genbank_data = fetch_ncbi_genbank_data(id_list[0])

        print("正在提取 DNA 和蛋白质序列...")
        cds_data = extract_cds_and_translate(genbank_data, protein_sequence)

        return {
            "uniprot_id": uniprot_id,
            "protein_sequence": protein_sequence,
            "cds_data": cds_data,
        }
    except Exception as e:
        print(f"发生错误:{e}")
        return {"error": str(e)}
|
| 157 |
+
|
| 158 |
+
def process_data(uniprot_id):
    """Build a gene/protein pair record for *uniprot_id*.

    Fetches the protein and its CDS candidates, then packs the
    best-matching CDS into the dataset's pair schema
    (``seq_a`` = DNA / ``seq_b`` = protein).

    Returns the pair dict on success, or -1 when the lookup failed
    or produced no CDS.
    """
    result = get_protein_and_dna_sequences(uniprot_id)

    # Guard clauses: lookup error, or no CDS candidates at all.
    if "error" in result:
        print(f"错误:{result['error']}")
        return -1
    if not result["cds_data"]:
        return -1

    # cds_data is sorted by similarity, so index 0 is the best match.
    best_cds = result["cds_data"][0]
    return {
        "seq_id_a": best_cds["gene_id"],
        "seq_type_a": "gene",
        "seq_a": best_cds["dna_sequence"],
        "seq_id_b": uniprot_id,
        "seq_type_b": "pro",
        "seq_b": best_cds["protein_sequence"],
        "protein_id": best_cds["protein_id"],
    }
|
| 180 |
+
|
| 181 |
+
if __name__ == "__main__":
    # Smoke test: fetch the DNA/protein pair for a sample UniProt accession.
    pair = process_data("Q9Z3S1")
    print(pair)
|