tonigi commited on
Commit
489bd70
·
1 Parent(s): c1300a3
Files changed (3) hide show
  1. .gitignore +1 -0
  2. README.txt +19 -0
  3. app.py +179 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ env/
README.txt CHANGED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Create a Gradio app which requests the uniprot ID of a protein and returns the protein sequence as a data frame.
2
+ Use the XML file returned by UniProt. Parse the annotations and return the following columns in the dataframe:
3
+
4
+ - Residue Number
5
+ - Residue code (1 letter) for the wild type residue
6
+ - Secondary structure (taken from the annotations)
7
+ - Pfam domain
8
+ - Disorder
9
+ - Disulfide bridges
10
+ - Glycosylation sites
11
+ - Phosphorylation sites
12
+ - active sites
13
+ - metal binding sites
14
+ - DNA binding sites
15
+ - RNA binding sites
16
+ - ligand binding sites
17
+ - modified
18
+
19
+
app.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from io import StringIO
4
+ from urllib.request import urlopen
5
+ import re
6
+ import xml.etree.ElementTree as ET
7
+
8
# XML namespace used by every element of a UniProt entry document.
_UNIPROT_NS = "{http://uniprot.org/uniprot}"


def _feature_span(location):
    """Return the 1-based (start, end) of a <location>, or None if unknown.

    A location is either a single <position> or a <begin>/<end> pair.
    UniProt marks uncertain boundaries by omitting the ``position``
    attribute; such features are reported as None so the caller can skip
    them instead of aborting the whole parse.
    """
    position = location.find(f"{_UNIPROT_NS}position")
    if position is not None:
        bounds = (position, position)
    else:
        bounds = (
            location.find(f"{_UNIPROT_NS}begin"),
            location.find(f"{_UNIPROT_NS}end"),
        )

    coords = []
    for elem in bounds:
        raw = elem.get("position") if elem is not None else None
        if raw is None:
            return None
        try:
            coords.append(int(raw))
        except ValueError:
            return None
    return coords[0], coords[1]


def _parse_uniprot_xml(xml_text):
    """Extract the sequence and feature annotations from UniProt XML text.

    Returns the same (protein_sequence, annotations, error_message) triple
    as get_uniprot_data.
    """
    root = ET.fromstring(xml_text)

    # Work on the first <entry>; the document root is normally <uniprot>.
    entry_tag = f"{_UNIPROT_NS}entry"
    entry = root if root.tag == entry_tag else root.find(entry_tag)
    if entry is None:
        return None, None, "Could not find sequence in UniProt response"

    # Only the <sequence> that is a direct child of <entry> holds residues.
    # A recursive search would first hit the empty <sequence type="..."/>
    # placeholders inside <isoform> comments, which have no text.
    sequence_elem = entry.find(f"{_UNIPROT_NS}sequence")
    sequence_text = sequence_elem.text if sequence_elem is not None else None
    # Drop any line breaks or padding inside the sequence text.
    protein_sequence = "".join((sequence_text or "").split())
    if not protein_sequence:
        return None, None, "Could not find sequence in UniProt response"

    annotations = {}
    for feature in entry.findall(f"{_UNIPROT_NS}feature"):
        location = feature.find(f"{_UNIPROT_NS}location")
        if location is None:
            continue

        span = _feature_span(location)
        if span is None:
            continue

        start, end = span
        description = feature.get('description', '')
        annotations.setdefault(feature.get('type'), []).append(
            (start, end, description)
        )

    return protein_sequence, annotations, None


def get_uniprot_data(uniprot_id):
    """
    Fetches protein sequence and annotation data from UniProt in XML format.

    Args:
        uniprot_id: The UniProt ID of the protein. Surrounding whitespace is
            ignored.

    Returns:
        A tuple containing:
        - protein_sequence: The protein sequence as a string.
        - annotations: A dictionary mapping each feature type to a list of
          (start, end, description) tuples with 1-based inclusive positions.
        - error_message: An error message if something goes wrong, otherwise None

        On failure the first two items are None. This function never raises;
        every failure is reported through error_message.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import quote

    uniprot_id = str(uniprot_id or "").strip()
    if not uniprot_id:
        return None, None, "Please provide a UniProt ID"

    try:
        # The ID comes straight from a text box: escape it so it cannot
        # alter the request path or produce an invalid URL.
        safe_id = quote(uniprot_id, safe="")
        url = f"https://www.uniprot.org/uniprot/{safe_id}.xml"

        # The context manager closes the connection; the timeout keeps a
        # stalled request from hanging the UI indefinitely.
        with urlopen(url, timeout=30) as response:
            xml_text = response.read().decode('utf-8')

        return _parse_uniprot_xml(xml_text)

    except Exception as e:
        # Deliberately broad: this is the boundary that turns any network or
        # parse failure into a message for the user.
        return None, None, f"Error fetching or processing data from UniProt: {e}"
65
+
66
def _columns_for_feature(kind, description):
    """Return the output column names a feature should be written to.

    Args:
        kind: Lower-cased UniProt feature type.
        description: The feature description (may be empty).

    Returns:
        A tuple of column names; empty when the feature is not reported.
    """
    text = description.lower()

    if kind in ('helix', 'strand', 'turn'):
        return ('Secondary structure',)
    if kind == 'domain':
        return ('Domain',)
    # 'region of interest' is the feature type name used in UniProt XML;
    # plain 'region' is kept for backward compatibility.
    if kind in ('region', 'region of interest'):
        if 'pfam' in text:
            return ('Pfam domain',)
        if 'disorder' in text:
            return ('Disorder',)
        return ()
    if kind == 'disulfide bond':
        return ('Disulfide bridges',)
    if kind == 'glycosylation site':
        return ('Glycosylation sites',)
    if kind == 'modified residue':
        # Phosphorylation is annotated as a modified residue
        # (e.g. "Phosphoserine"), so report it in both columns.
        if 'phospho' in text:
            return ('modified', 'Phosphorylation sites')
        return ('modified',)
    if kind == 'active site':
        return ('active sites',)
    if kind == 'metal ion-binding site':
        return ('metal binding sites',)
    if kind == 'dna-binding region':
        return ('DNA binding sites',)
    if kind == 'binding site':
        if 'metal' in text:
            return ('metal binding sites',)
        # Case-sensitive on purpose: lower-case "rna"/"dna" also occur
        # inside ordinary words such as "external".
        if 'DNA' in description:
            return ('DNA binding sites',)
        if 'RNA' in description:
            return ('RNA binding sites',)
        return ('ligand binding sites',)
    if kind == 'site':
        # Generic sites only belong here when they mention phosphorylation.
        if 'phospho' in text:
            return ('Phosphorylation sites',)
        return ()
    return ()


def create_dataframe(protein_sequence, annotations):
    """
    Creates a Pandas DataFrame from protein sequence and annotations.

    Args:
        protein_sequence: The protein sequence as a string of one-letter
            residue codes.
        annotations: A dictionary mapping a UniProt feature type to a list of
            (start, end, description) tuples with 1-based inclusive positions.

    Returns:
        A DataFrame with one row per residue. Annotation columns hold the
        labels of all features covering that residue joined by "; ", or an
        empty string when there are none. Features without a description are
        labelled with their feature type (e.g. "helix").
    """
    columns = [
        "Residue Number",
        "Residue code",
        "Secondary structure",
        "Domain",
        "Pfam domain",
        "Disorder",
        "Disulfide bridges",
        "Glycosylation sites",
        "Phosphorylation sites",
        "active sites",
        "metal binding sites",
        "DNA binding sites",
        "RNA binding sites",
        "ligand binding sites",
        "modified",
    ]
    annotation_columns = columns[2:]

    sequence = protein_sequence or ""
    length = len(sequence)

    # Collect labels per residue in plain lists; building the frame once at
    # the end avoids slow cell-by-cell DataFrame writes.
    cells = {name: [[] for _ in range(length)] for name in annotation_columns}

    for feature_type, values in (annotations or {}).items():
        kind = (feature_type or "").lower()
        for start, end, description in values:
            description = description or ""
            targets = _columns_for_feature(kind, description)
            if not targets:
                continue

            # Secondary-structure features carry no description, so fall
            # back to the feature type to keep the column informative.
            label = description or feature_type

            if kind == 'disulfide bond':
                # begin/end are the two bonded cysteines, not a range.
                positions = sorted({start, end})
            else:
                positions = range(max(start, 1), min(end, length) + 1)

            for position in positions:
                if 1 <= position <= length:
                    for column in targets:
                        cells[column][position - 1].append(label)

    data = {
        "Residue Number": list(range(1, length + 1)),
        "Residue code": list(sequence),
    }
    for name in annotation_columns:
        data[name] = ["; ".join(labels) for labels in cells[name]]

    # Passing columns explicitly keeps the schema even for an empty sequence.
    return pd.DataFrame(data, columns=columns)
147
+
148
def process_uniprot_id(uniprot_id):
    """
    Main function to process a UniProt ID.

    Args:
        uniprot_id: The UniProt ID.

    Returns:
        A Pandas DataFrame with one row per residue of the protein.

    Raises:
        gr.Error: If the data could not be fetched or parsed. The output of
            this function is wired to a gr.Dataframe, which cannot display a
            plain message string, so failures are surfaced through Gradio's
            error mechanism instead of being returned.
    """
    protein_sequence, annotations, error_message = get_uniprot_data(uniprot_id)

    if error_message:
        raise gr.Error(error_message)

    if not protein_sequence:
        raise gr.Error("Could not retrieve or process data for the given Uniprot ID")

    # An entry without any feature annotations is still a valid result:
    # show the bare sequence rather than reporting a failure.
    return create_dataframe(protein_sequence, annotations or {})
168
+
169
+
170
# Gradio user interface: one text input for the accession, one table output.
uniprot_id_input = gr.Textbox(label="UniProt ID", placeholder="e.g., P04637")
annotation_table = gr.Dataframe(label="Protein Sequence and Annotations")

iface = gr.Interface(
    fn=process_uniprot_id,
    inputs=uniprot_id_input,
    outputs=annotation_table,
    title="UniProt Protein Sequence and Annotation Viewer",
    description="Enter a UniProt ID to view the protein sequence and its annotations in a DataFrame.",
)

# Start the web server as soon as this file is executed.
iface.launch()