# %% Imports and dataset loading
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer
import evaluate
import numpy as np
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from Bio.Seq import Seq

# Rows carry seq_a / seq_b / label columns (see the DatasetDict repr below);
# later cells read seq_a as DNA, seq_b as protein and label as 0/1.
raw_datasets = load_dataset('csv', data_files='central_dogma.csv')
raw_datasets


# %% Translation helpers
def translate_dna_to_protein_biopython(dna_sequence, to_stop=False):
    """Translate a DNA sequence into its protein sequence using Biopython.

    Args:
        dna_sequence: DNA string. It is upper-cased before translation, so
            mixed-case input is accepted.
        to_stop: Passed through to ``Seq.translate``. With the default
            ``False`` the whole sequence is translated and stop codons show
            up as ``*`` in the result; set it to ``True`` to stop at the
            first in-frame stop codon.

    Returns:
        The translated protein as a plain ``str``.
    """
    normalized = dna_sequence.upper()

    # Trim a trailing partial codon ourselves. Handing Biopython a length that
    # is not a multiple of three raises "BiopythonWarning: Partial codon ...",
    # which asks for an explicit trim and may become an error in the future.
    codon_aligned_length = len(normalized) - len(normalized) % 3
    protein_seq = Seq(normalized[:codon_aligned_length]).translate(to_stop=to_stop)

    return str(protein_seq)


def trim_sequence(sequence, front, length):
    """Return the window of ``length`` characters starting at offset ``front``.

    Plain slice semantics apply: a window that runs past the end of
    ``sequence`` is silently truncated rather than raising.
    """
    return sequence[front:front + length]


translate_dna_to_protein_biopython("TTGAAGATTGAGCTTGCTAGCAGCTACGGCTTTTGCTTTGGGGTAAAGCGCGCCATAAAGATAGCCGAAAATGCGGGCGATGCCGCTACTATCGGGCCTCTCATACATAATAACGAAGAGATAAACCGCCTGGCTACGAATTTCAATGTCAAGACCCTCCACGGCATAAATGAGCTAAAGGACGAGAAAAAGGCCATCATACGCACTCACGGTATCACAAAAAGCGATCTGGCCGAGCTTAAAAAGACCGATATCAAAGTCATAGACGCCACTTGCCCGTTCGTGACCAAGCCGCAGCAAATTTGCGAGGATATGAGCAACGCAGGATACGATGTCGTGATATTTGGCGATGAAAATCATCCCGAAGTCAAAGGAGTGAAGTCCTATGCCAGCGGAAAGGTTTATGTCGTGCTCGATGAGAGCGAGCTTGAGGGAGTGAAATTTAGACAAAAGGTAGCACTCGTCAGTCAAACGACGCGCAAAGTCGAAAAATTTATGCAAATAGCGAACTACTTGATGCTACGCGTCAAAGAGGTGCGAGTTTTCAACACTATCTGCAACGCGACCTTCGAGAATCAGGAGGCGGTCAAAAATTTAGCCAAAAGAGCCGATGTGATGATAGTCATCGGTGGTAAAAATAGCTCTAATACAAAGCAGCTTTATCTGATATCTAAAAATTTCTGCGAGGACAGCTACCTGATAGAGAGCGAACACGAAGTCGAGAAAAGCTGGTTTGAAGGCAAGAATTTATGCGGTATAAGTGCGGGAGCGAGCACGCCTGATTGGATCATACAAAAAGTCGTCGACGCGATAGAGAAATTTTAA")
# %% Select positives whose protein equals the translation of a fixed DNA window
# Number of leading characters of seq_a skipped before the translated window.
# Presumably an upstream flank in front of the coding region; verify against the dataset.
FLANK_LENGTH = 100

not_match_list = []   # positives whose fixed-window translation disagrees with seq_b
pos_select_list = []  # accepted positive examples
neg_select_list = []  # every negative example is kept as-is

for item in raw_datasets["train"]:
    example_dna = item["seq_a"]
    example_protein = item["seq_b"]
    label = item["label"]

    if label == 0:
        # Negatives are kept unconditionally, so skip the trim/translate work
        # (and the Biopython warnings it can produce) for them.
        neg_select_list.append(item)
        continue
    if label != 1:
        continue

    # Translate exactly as many nucleotides as the protein needs (3 per residue).
    dna_length = len(example_protein) * 3
    trimmed_sequence = trim_sequence(example_dna, FLANK_LENGTH, dna_length)
    protein_trans = translate_dna_to_protein_biopython(trimmed_sequence)

    # The comparison ignores the first residue and the last two residues.
    # NOTE(review): the original comment promised one residue of slack at each
    # end, but [1:-2] drops two at the end; confirm which one is intended.
    if protein_trans[1:-2] != example_protein[1:-2]:
        not_match_list.append(item)
    else:
        pos_select_list.append(item)


# %% Longest-ORF translation
from Bio.Seq import Seq


def _iter_orfs(strand):
    """Yield each ATG-to-stop ORF (stop codon included) in the three frames of ``strand``."""
    stop_codons = ("TAA", "TAG", "TGA")
    for frame in range(3):
        start = None
        for i in range(frame, len(strand) - 2, 3):
            codon = strand[i:i + 3]
            if start is None:
                # Only the first ATG after the previous stop opens an ORF.
                if codon == "ATG":
                    start = i
            elif codon in stop_codons:
                yield strand[start:i + 3]
                start = None


def find_longest_orf(dna_sequence_str):
    """Translate the longest open reading frame found on either strand.

    All six reading frames are scanned: three on the given strand, then three
    on its reverse complement. An ORF runs from the first ``ATG`` after the
    previous stop codon up to and including the next in-frame ``TAA``,
    ``TAG`` or ``TGA``; an ``ATG`` with no following stop is not reported.

    Args:
        dna_sequence_str: DNA string. It is upper-cased first, so lower-case
            input is scanned instead of silently yielding no ORF.

    Returns:
        The protein translated from the longest ORF, without the stop symbol,
        or ``"-"`` when no ORF exists.
    """
    forward = dna_sequence_str.upper()
    reverse = str(Seq(forward).reverse_complement())

    # max() keeps the first of equally long candidates, so the scan order
    # (forward frames 0-2, then reverse frames 0-2) decides ties.
    longest_orf = max(
        (orf for strand in (forward, reverse) for orf in _iter_orfs(strand)),
        key=len,
        default="",
    )

    if not longest_orf:
        return "-"
    # to_stop=True drops the terminal stop codon from the protein string.
    return str(Seq(longest_orf).translate(to_stop=True))


# Example DNA sequence
dna_sequence = "TCAGTTTTTTGCTTTCGCCGCCGCTTTAACAATGACAGCGAACGCTGACGCTTTTAACGATGCACCACCAACTAATGCACCATCAATATCAGGCTGAGTAAACAATTCTGCTGCATTTGCATCATTGACGGAACCGCCATATTGAATAATTACCCGTTCAGCAACGGCTTGGCTTTGTTTTGCAATATGACCTCGAATAAAGGCATGTACTGCTTGTGCTTGAGCTGGAGTCGCCGATTTACCTGTACCGATAGCCCAAATCGGTTCATAAGCGATTACTGCACCGTTAAATGCTTCAACACCTAGTGTATTCATCACCGCATCAATTTGACGTGCACAAACCTCTTCCGTTTTGCCTGCTTCATTTTCAGCTTCGCTTTCACCGATACATAATACAGGAACTAAACCAGCATCTTTTAACACACCAAATTTTTTCGCAATAAATTCATCACTTTCATGATGATATTGACGTCGCTCAGAATGACCGATAATGACATATTTTACACCAAAGTCTTTTAACATTTCTGTTGAAATATCACCGGTAAATGCACCTTGTTTGTTTAAATCAACATTTTGAGTACCTAAAGCAATATCACTGCTGACCAGTGCAGTTTCAGCTTCCGCTAAATACATGACAGGCGGTGCAATTGCCACATCACAGCCTGACACCGCATTAAGTTCATCTTTTAAACCGGTAATAAGTTCTTTTGTAAAGGCTTTACTACCATTTAATTTCCAGTTACCCATGACTAAAGGACGACGAGCCAT"

# Translate its longest ORF
protein_sequence = find_longest_orf(dna_sequence)
print("最长的蛋白质序列:", protein_sequence)
# %% Rescue positives whose longest-ORF translation is contained in seq_b
# NOTE: this cell and the next append to pos_select_list in place, so running
# them twice duplicates rows; re-run the selection cell first to reset the lists.
not_match_list1 = []  # positives that also fail the ORF containment test
for item in not_match_list:
    if item["label"] != 1:
        continue
    example_protein = item["seq_b"]
    protein_trans = find_longest_orf(item["seq_a"])

    # Containment is enough; the first residue and the last two of the ORF
    # translation are allowed to differ.
    orf_core = protein_trans[1:-2]

    # An empty core must not count as a match. It arises for the "-" no-ORF
    # sentinel and for ORFs of three residues or fewer, and "" is a substring
    # of every string (str.find("") returns 0), so such rows used to be
    # accepted as positives.
    if orf_core and orf_core in example_protein:
        pos_select_list.append(item)
    else:
        not_match_list1.append(item)


# %% Rescue positives whose longest-ORF translation aligns well with seq_b
from Bio import Align

# Minimum (alignment score / ORF protein length) for a pair to be accepted.
SIMILARITY_THRESHOLD = 0.8

# Global pairwise aligner with Biopython's default scoring.
aligner = Align.PairwiseAligner()
aligner.mode = "global"

for item in not_match_list1:
    if item["label"] != 1:
        continue
    example_protein = item["seq_b"]
    protein_trans = find_longest_orf(item["seq_a"])

    if protein_trans == "-":
        continue  # no ORF was found, so there is nothing to align

    # aligner.score() gives the optimal score without building alignment objects.
    score = aligner.score(example_protein, protein_trans)

    # NOTE(review): normalising by the ORF protein length alone lets a short
    # ORF reach a high ratio against a much longer seq_b; consider dividing by
    # the longer of the two lengths if that is not intended.
    sim_score = score / len(protein_trans)

    if sim_score > SIMILARITY_THRESHOLD:
        pos_select_list.append(item)


# %% Selection summary: negatives kept vs. positives accepted
print(len(neg_select_list),len(pos_select_list))
# %% Demo: global pairwise alignment of two example protein sequences
from Bio import Align

# Example protein sequences
seq1 = "MAESREPRGAVEAELDPVEYTLRKRLPHRLPRRPNDVYVNMKTDFKAQLARCQKLLDCGARGQSACSEIYIHGLGLAINRAINIALQLQAGSFGALQVAANTSTVELVDELEPETDTREPVIRNRNNSAIHIRVFRVAPQ"
seq2 = "MWQEARHGAWHRRATSGSEQDKKKFIGGRWSHTPLVQKTTVPVHGHGERPTSRGRQDTTAPRTPSSAGSVPRDVAGGPPRRLAQKSHLWLGYWGATLKTRMWIAELLRFRITGSRVSVSGSSSSTSSTVEVLAATCRAPKLPACSCRAMLMARLMARPRPWM"

# Global-mode aligner; scoring parameters are left at their defaults
aligner = Align.PairwiseAligner()
aligner.mode = "global"

# Run the alignment
alignments = aligner.align(seq1, seq2)

# Report only the first alignment: the layout, its score, then both sequences
alignment = next(iter(alignments), None)
if alignment is not None:
    print(
        alignment,
        f"比对得分: {alignment.score}",
        f"查询序列: {alignment.query}",
        f"目标序列: {alignment.target}",
        "-----------------------------------------------------------\n",
        sep="\n",
    )
"name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }