File size: 10,133 Bytes
4707555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Title     : codon_table.py
project   : web
Created by: julse
Created on: 2025/7/9 20:40
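des: Codon usage tables (Kazusa text format) for several species, plus
     parse_text() to turn such a table into a pandas DataFrame.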
"""
import re
import pandas as pd
# Codon usage table for Homo sapiens, kept as the raw text of the page at the
# URL below (species=9606).
# Each entry reads: codon (RNA alphabet), one-letter amino acid ('*' on the
# UAA/UAG/UGA entries), fraction, frequency, and a count in parentheses --
# the five fields that parse_text() extracts. Four entries per line.
# https://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=9606&aa=1&style=N
Homo_sapiens = """
UUU F 0.46 17.6 (714298)  UCU S 0.19 15.2 (618711)  UAU Y 0.44 12.2 (495699)  UGU C 0.46 10.6 (430311)
UUC F 0.54 20.3 (824692)  UCC S 0.22 17.7 (718892)  UAC Y 0.56 15.3 (622407)  UGC C 0.54 12.6 (513028)
UUA L 0.08  7.7 (311881)  UCA S 0.15 12.2 (496448)  UAA * 0.30  1.0 ( 40285)  UGA * 0.47  1.6 ( 63237)
UUG L 0.13 12.9 (525688)  UCG S 0.05  4.4 (179419)  UAG * 0.24  0.8 ( 32109)  UGG W 1.00 13.2 (535595)

CUU L 0.13 13.2 (536515)  CCU P 0.29 17.5 (713233)  CAU H 0.42 10.9 (441711)  CGU R 0.08  4.5 (184609)
CUC L 0.20 19.6 (796638)  CCC P 0.32 19.8 (804620)  CAC H 0.58 15.1 (613713)  CGC R 0.18 10.4 (423516)
CUA L 0.07  7.2 (290751)  CCA P 0.28 16.9 (688038)  CAA Q 0.27 12.3 (501911)  CGA R 0.11  6.2 (250760)
CUG L 0.40 39.6 (1611801)  CCG P 0.11  6.9 (281570)  CAG Q 0.73 34.2 (1391973)  CGG R 0.20 11.4 (464485)

AUU I 0.36 16.0 (650473)  ACU T 0.25 13.1 (533609)  AAU N 0.47 17.0 (689701)  AGU S 0.15 12.1 (493429)
AUC I 0.47 20.8 (846466)  ACC T 0.36 18.9 (768147)  AAC N 0.53 19.1 (776603)  AGC S 0.24 19.5 (791383)
AUA I 0.17  7.5 (304565)  ACA T 0.28 15.1 (614523)  AAA K 0.43 24.4 (993621)  AGA R 0.21 12.2 (494682)
AUG M 1.00 22.0 (896005)  ACG T 0.11  6.1 (246105)  AAG K 0.57 31.9 (1295568)  AGG R 0.21 12.0 (486463)

GUU V 0.18 11.0 (448607)  GCU A 0.27 18.4 (750096)  GAU D 0.46 21.8 (885429)  GGU G 0.16 10.8 (437126)
GUC V 0.24 14.5 (588138)  GCC A 0.40 27.7 (1127679)  GAC D 0.54 25.1 (1020595)  GGC G 0.34 22.2 (903565)
GUA V 0.12  7.1 (287712)  GCA A 0.23 15.8 (643471)  GAA E 0.42 29.0 (1177632)  GGA G 0.25 16.5 (669873)
GUG V 0.46 28.1 (1143534)  GCG A 0.11  7.4 (299495)  GAG E 0.58 39.6 (1609975)  GGG G 0.25 16.5 (669768)
"""

# Codon usage table for Mus musculus, as raw text in the same entry layout as
# the other tables in this module (codon, amino acid, fraction, frequency,
# count in parentheses).
# NOTE(review): no source URL is recorded for this table; presumably it comes
# from the same Kazusa database as the others -- confirm and add the link.
Mus_musculus = """
UUU F 0.44 17.2 (422153)  UCU S 0.20 16.2 (398250)  UAU Y 0.43 12.2 (298518)  UGU C 0.48 11.4 (279729)
UUC F 0.56 21.8 (535439)  UCC S 0.22 18.1 (444041)  UAC Y 0.57 16.1 (394074)  UGC C 0.52 12.3 (301384)
UUA L 0.07  6.7 (165150)  UCA S 0.14 11.8 (289799)  UAA * 0.28  1.0 ( 23403)  UGA * 0.49  1.6 ( 40148)
UUG L 0.13 13.4 (329668)  UCG S 0.05  4.2 (103815)  UAG * 0.23  0.8 ( 19126)  UGG W 1.00 12.5 (306619)

CUU L 0.13 13.4 (329757)  CCU P 0.31 18.4 (450637)  CAU H 0.41 10.6 (260637)  CGU R 0.08  4.7 (114854)
CUC L 0.20 20.2 (495018)  CCC P 0.30 18.2 (446868)  CAC H 0.59 15.3 (375626)  CGC R 0.17  9.4 (229758)
CUA L 0.08  8.1 (198032)  CCA P 0.29 17.3 (423707)  CAA Q 0.26 12.0 (293318)  CGA R 0.12  6.6 (161412)
CUG L 0.39 39.5 (969515)  CCG P 0.10  6.2 (151521)  CAG Q 0.74 34.1 (836320)  CGG R 0.19 10.2 (250836)

AUU I 0.34 15.4 (377698)  ACU T 0.25 13.7 (335039)  AAU N 0.43 15.6 (382284)  AGU S 0.15 12.7 (311331)
AUC I 0.50 22.5 (552184)  ACC T 0.35 19.0 (465115)  AAC N 0.57 20.3 (499149)  AGC S 0.24 19.7 (483013)
AUA I 0.16  7.4 (180467)  ACA T 0.29 16.0 (391437)  AAA K 0.39 21.9 (537723)  AGA R 0.22 12.1 (297135)
AUG M 1.00 22.8 (559953)  ACG T 0.10  5.6 (138180)  AAG K 0.61 33.6 (825270)  AGG R 0.22 12.2 (299472)

GUU V 0.17 10.7 (262535)  GCU A 0.29 20.0 (491093)  GAU D 0.45 21.0 (515049)  GGU G 0.18 11.4 (280522)
GUC V 0.25 15.4 (377902)  GCC A 0.38 26.0 (637878)  GAC D 0.55 26.0 (638504)  GGC G 0.33 21.2 (520069)
GUA V 0.12  7.4 (182733)  GCA A 0.23 15.8 (388723)  GAA E 0.41 27.0 (661498)  GGA G 0.26 16.8 (411344)
GUG V 0.46 28.4 (696158)  GCG A 0.09  6.4 (157124)  GAG E 0.59 39.4 (965963)  GGG G 0.23 15.2 (372099)
"""

# Codon usage table registered under the name "Pichia", kept as the raw text
# of the page at the URL below (species=4922).
# Entry layout: codon, amino acid, fraction, frequency, count in parentheses.
# The counts here are space-padded inside the parentheses (e.g. "(  1963)");
# parse_text() removes that padding.
# https://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=4922&aa=1&style=N
Pichia = """
UUU F 0.54 24.1 (  1963)  UCU S 0.29 24.4 (  1983)  UAU Y 0.47 16.0 (  1300)  UGU C 0.64  7.7 (   626)
UUC F 0.46 20.6 (  1675)  UCC S 0.20 16.5 (  1344)  UAC Y 0.53 18.1 (  1473)  UGC C 0.36  4.4 (   356)
UUA L 0.16 15.6 (  1265)  UCA S 0.18 15.2 (  1234)  UAA * 0.51  0.8 (    69)  UGA * 0.20  0.3 (    27)
UUG L 0.33 31.5 (  2562)  UCG S 0.09  7.4 (   598)  UAG * 0.29  0.5 (    40)  UGG W 1.00 10.3 (   834)

CUU L 0.16 15.9 (  1289)  CCU P 0.35 15.8 (  1282)  CAU H 0.57 11.8 (   960)  CGU R 0.17  6.9 (   564)
CUC L 0.08  7.6 (   620)  CCC P 0.15  6.8 (   553)  CAC H 0.43  9.1 (   737)  CGC R 0.05  2.2 (   175)
CUA L 0.11 10.7 (   873)  CCA P 0.42 18.9 (  1540)  CAA Q 0.61 25.4 (  2069)  CGA R 0.10  4.2 (   340)
CUG L 0.16 14.9 (  1215)  CCG P 0.09  3.9 (   320)  CAG Q 0.39 16.3 (  1323)  CGG R 0.05  1.9 (   158)

AUU I 0.50 31.1 (  2532)  ACU T 0.40 22.4 (  1820)  AAU N 0.48 25.1 (  2038)  AGU S 0.15 12.5 (  1020)
AUC I 0.31 19.4 (  1580)  ACC T 0.26 14.5 (  1175)  AAC N 0.52 26.7 (  2168)  AGC S 0.09  7.6 (   621)
AUA I 0.18 11.1 (   906)  ACA T 0.24 13.8 (  1118)  AAA K 0.47 29.9 (  2433)  AGA R 0.48 20.1 (  1634)
AUG M 1.00 18.7 (  1517)  ACG T 0.11  6.0 (   491)  AAG K 0.53 33.8 (  2748)  AGG R 0.16  6.6 (   539)

GUU V 0.42 26.9 (  2188)  GCU A 0.45 28.9 (  2351)  GAU D 0.58 35.7 (  2899)  GGU G 0.44 25.5 (  2075)
GUC V 0.23 14.9 (  1210)  GCC A 0.26 16.6 (  1348)  GAC D 0.42 25.9 (  2103)  GGC G 0.14  8.1 (   655)
GUA V 0.15  9.9 (   804)  GCA A 0.23 15.1 (  1228)  GAA E 0.56 37.4 (  3043)  GGA G 0.33 19.1 (  1550)
GUG V 0.19 12.3 (   998)  GCG A 0.06  3.9 (   314)  GAG E 0.44 29.0 (  2360)  GGG G 0.10  5.8 (   468)
"""

# Codon usage table for Escherichia coli, kept as the raw text of the page at
# the URL below (species=316407).
# Entry layout: codon, amino acid, fraction, frequency, count in parentheses
# (counts are space-padded inside the parentheses).
# https://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=316407&aa=1&style=N
Escherichia_coli = """
UUU F 0.57 22.2 ( 30462)  UCU S 0.15  8.4 ( 11512)  UAU Y 0.57 16.1 ( 22037)  UGU C 0.44  5.1 (  7016)
UUC F 0.43 16.5 ( 22705)  UCC S 0.15  8.6 ( 11802)  UAC Y 0.43 12.2 ( 16795)  UGC C 0.56  6.4 (  8797)
UUA L 0.13 13.8 ( 18894)  UCA S 0.12  7.0 (  9620)  UAA * 0.64  2.0 (  2765)  UGA * 0.29  0.9 (  1249)
UUG L 0.13 13.6 ( 18664)  UCG S 0.15  8.9 ( 12210)  UAG * 0.07  0.2 (   321)  UGG W 1.00 15.2 ( 20889)

CUU L 0.10 11.0 ( 15082)  CCU P 0.16  7.0 (  9540)  CAU H 0.57 13.0 ( 17791)  CGU R 0.38 21.0 ( 28866)
CUC L 0.10 11.1 ( 15272)  CCC P 0.12  5.5 (  7490)  CAC H 0.43  9.8 ( 13399)  CGC R 0.40 22.3 ( 30530)
CUA L 0.04  3.8 (  5266)  CCA P 0.19  8.4 ( 11569)  CAA Q 0.35 15.4 ( 21121)  CGA R 0.06  3.5 (  4810)
CUG L 0.50 53.1 ( 72898)  CCG P 0.53 23.4 ( 32080)  CAG Q 0.65 29.0 ( 39835)  CGG R 0.10  5.4 (  7401)

AUU I 0.51 30.4 ( 41644)  ACU T 0.16  8.8 ( 12119)  AAU N 0.45 17.6 ( 24106)  AGU S 0.15  8.7 ( 11924)
AUC I 0.42 25.2 ( 34568)  ACC T 0.44 23.5 ( 32265)  AAC N 0.55 21.6 ( 29581)  AGC S 0.28 16.1 ( 22067)
AUA I 0.07  4.2 (  5733)  ACA T 0.13  6.9 (  9452)  AAA K 0.76 33.6 ( 46116)  AGA R 0.04  2.0 (  2771)
AUG M 1.00 27.8 ( 38167)  ACG T 0.27 14.4 ( 19820)  AAG K 0.24 10.3 ( 14174)  AGG R 0.02  1.1 (  1496)

GUU V 0.26 18.2 ( 24991)  GCU A 0.16 15.2 ( 20813)  GAU D 0.63 32.2 ( 44217)  GGU G 0.34 24.7 ( 33875)
GUC V 0.22 15.3 ( 21050)  GCC A 0.27 25.7 ( 35252)  GAC D 0.37 19.1 ( 26270)  GGC G 0.41 29.8 ( 40849)
GUA V 0.15 10.9 ( 14901)  GCA A 0.21 20.1 ( 27567)  GAA E 0.69 39.7 ( 54431)  GGA G 0.11  7.9 ( 10774)
GUG V 0.37 26.3 ( 36108)  GCG A 0.36 33.9 ( 46524)  GAG E 0.31 18.0 ( 24629)  GGG G 0.15 11.0 ( 15115)
"""
# Codon usage table for Saccharomyces cerevisiae, kept as the raw text of the
# page at the URL below (species=4932).
# Entry layout: codon, amino acid, fraction, frequency, count in parentheses
# (some counts are space-padded inside the parentheses).
# https://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=4932&aa=1&style=N
Saccharomyces_cerevisiae = """
UUU F 0.59 26.1 (170666)  UCU S 0.26 23.5 (153557)  UAU Y 0.56 18.8 (122728)  UGU C 0.63  8.1 ( 52903)
UUC F 0.41 18.4 (120510)  UCC S 0.16 14.2 ( 92923)  UAC Y 0.44 14.8 ( 96596)  UGC C 0.37  4.8 ( 31095)
UUA L 0.28 26.2 (170884)  UCA S 0.21 18.7 (122028)  UAA * 0.47  1.1 (  6913)  UGA * 0.30  0.7 (  4447)
UUG L 0.29 27.2 (177573)  UCG S 0.10  8.6 ( 55951)  UAG * 0.23  0.5 (  3312)  UGG W 1.00 10.4 ( 67789)

CUU L 0.13 12.3 ( 80076)  CCU P 0.31 13.5 ( 88263)  CAU H 0.64 13.6 ( 89007)  CGU R 0.14  6.4 ( 41791)
CUC L 0.06  5.4 ( 35545)  CCC P 0.15  6.8 ( 44309)  CAC H 0.36  7.8 ( 50785)  CGC R 0.06  2.6 ( 16993)
CUA L 0.14 13.4 ( 87619)  CCA P 0.42 18.3 (119641)  CAA Q 0.69 27.3 (178251)  CGA R 0.07  3.0 ( 19562)
CUG L 0.11 10.5 ( 68494)  CCG P 0.12  5.3 ( 34597)  CAG Q 0.31 12.1 ( 79121)  CGG R 0.04  1.7 ( 11351)

AUU I 0.46 30.1 (196893)  ACU T 0.35 20.3 (132522)  AAU N 0.59 35.7 (233124)  AGU S 0.16 14.2 ( 92466)
AUC I 0.26 17.2 (112176)  ACC T 0.22 12.7 ( 83207)  AAC N 0.41 24.8 (162199)  AGC S 0.11  9.8 ( 63726)
AUA I 0.27 17.8 (116254)  ACA T 0.30 17.8 (116084)  AAA K 0.58 41.9 (273618)  AGA R 0.48 21.3 (139081)
AUG M 1.00 20.9 (136805)  ACG T 0.14  8.0 ( 52045)  AAG K 0.42 30.8 (201361)  AGG R 0.21  9.2 ( 60289)

GUU V 0.39 22.1 (144243)  GCU A 0.38 21.2 (138358)  GAU D 0.65 37.6 (245641)  GGU G 0.47 23.9 (156109)
GUC V 0.21 11.8 ( 76947)  GCC A 0.22 12.6 ( 82357)  GAC D 0.35 20.2 (132048)  GGC G 0.19  9.8 ( 63903)
GUA V 0.21 11.8 ( 76927)  GCA A 0.29 16.2 (105910)  GAA E 0.70 45.6 (297944)  GGA G 0.22 10.9 ( 71216)
GUG V 0.19 10.8 ( 70337)  GCG A 0.11  6.2 ( 40358)  GAG E 0.30 19.2 (125717)  GGG G 0.12  6.0 ( 39359)
"""



# Lookup from species name to the raw codon usage table text defined above.
species_dict = {
    'Homo_sapiens': Homo_sapiens,
    'Mus_musculus': Mus_musculus,
    'Pichia': Pichia,
    'Escherichia_coli': Escherichia_coli,
    'Saccharomyces_cerevisiae': Saccharomyces_cerevisiae,
}

# Default entry pattern, compiled once at import time. Groups, in order:
# codon triplet, amino-acid letter (or '*'), fraction, frequency, and the
# parenthesised count (which may carry padding whitespace).
_CODON_ENTRY_PATTERN = re.compile(
    r'([A-Z]{3})\s+([A-Z\*])\s+([\d\.]+)\s+([\d\.]+)\s+\(([\d\s]+)\)'
)


def parse_text(data: str, pattern_txt=None) -> pd.DataFrame:
    """Parse a codon usage table in text form into a DataFrame.

    The text is scanned line by line; every match of the entry pattern on a
    line becomes one row, so a line may contribute several rows. Blank lines
    and text that does not match the pattern are ignored.

    Args:
        data: Table text, e.g. one of the species strings in this module.
        pattern_txt: Optional regular expression to use instead of the
            default entry pattern. It must define at least five capture
            groups, read in order as triplet, amino acid, fraction,
            frequency and number.

    Returns:
        A DataFrame with columns ``triplet``, ``amino_acid``, ``fraction``,
        ``frequency`` and ``number``. All values are kept as the matched
        strings (no numeric conversion); ``number`` has its whitespace
        removed. The frame is empty when nothing matches.
    """
    pattern = re.compile(pattern_txt) if pattern_txt else _CODON_ENTRY_PATTERN

    rows = []
    for line in data.split('\n'):
        if not line.strip():
            continue
        for match in pattern.finditer(line):
            triplet, amino_acid, fraction, frequency, number = match.group(1, 2, 3, 4, 5)
            # Only reachable with a custom pattern whose first group can
            # match the empty string.
            if triplet == '':
                continue
            # The count group accepts any whitespace (\s), not only spaces,
            # so drop all of it -- tab padding must not leak into the value.
            number = ''.join(number.split())
            rows.append([triplet, amino_acid, fraction, frequency, number])

    return pd.DataFrame(
        rows,
        columns=['triplet', 'amino_acid', 'fraction', 'frequency', 'number'],
    )