tonigi committed on
Commit
637c5bb
·
1 Parent(s): 548faf1
Files changed (1) hide show
  1. app.py +143 -50
app.py CHANGED
@@ -23,40 +23,52 @@ def get_uniprot_data(uniprot_id):
23
  url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
24
  response = urlopen(url).read().decode('utf-8')
25
 
26
- # Parse XML
27
  root = ET.fromstring(response)
 
28
 
29
  # Get sequence
30
- sequence_elem = root.find(".//{http://uniprot.org/uniprot}sequence")
31
  if sequence_elem is None:
32
  return None, None, "Could not find sequence in UniProt response"
33
  protein_sequence = sequence_elem.text.strip()
34
 
35
  # Get feature annotations
36
  annotations = {}
37
- for feature in root.findall(".//{http://uniprot.org/uniprot}feature"):
38
  feature_type = feature.get('type')
39
  description = feature.get('description', '')
40
 
41
  # Get position information
42
- location = feature.find("{http://uniprot.org/uniprot}location")
43
  if location is None:
44
  continue
45
-
46
- position = location.find("{http://uniprot.org/uniprot}position")
 
 
 
 
47
  if position is not None:
48
- start = end = int(position.get('position'))
49
- else:
50
- begin = location.find("{http://uniprot.org/uniprot}begin")
51
- end_elem = location.find("{http://uniprot.org/uniprot}end")
52
- if begin is None or end_elem is None:
53
- continue
 
 
 
54
  start = int(begin.get('position'))
55
  end = int(end_elem.get('position'))
56
-
57
- if feature_type not in annotations:
58
- annotations[feature_type] = []
59
- annotations[feature_type].append((start, end, description))
 
 
 
 
60
 
61
  return protein_sequence, annotations, None
62
 
@@ -97,53 +109,133 @@ def create_dataframe(protein_sequence, annotations):
97
  'helix': 'Secondary structure',
98
  'turn': 'Secondary structure',
99
  'domain': 'Domain',
100
- 'region': ['Pfam domain', 'Disorder'],
101
  'disulfide bond': 'Disulfide bridges',
102
  'glycosylation site': 'Glycosylation sites',
103
  'modified residue': 'modified',
104
  'active site': 'active sites',
105
- 'binding site': ['metal binding sites', 'DNA binding sites', 'RNA binding sites', 'ligand binding sites'],
106
  'site': 'Phosphorylation sites'
107
  }
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  for feature_type, values in annotations.items():
110
- feature_type = feature_type.lower() # Convert to lowercase for matching
111
- for start, end, description in values:
112
- # Get the corresponding column(s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  column = feature_mapping.get(feature_type)
114
  if not column:
115
  continue
116
 
117
- # Handle cases where one feature type maps to multiple possible columns
118
- if isinstance(column, list):
119
- if feature_type == 'region':
120
- if 'Pfam' in description:
121
- column = 'Pfam domain'
122
- elif 'disorder' in description.lower():
123
- column = 'Disorder'
124
- else:
 
 
 
125
  continue
126
- elif feature_type == 'binding site':
127
- if 'metal' in description.lower():
128
- column = 'metal binding sites'
129
- elif 'DNA' in description:
130
- column = 'DNA binding sites'
131
- elif 'RNA' in description:
132
- column = 'RNA binding sites'
133
- else:
134
- column = 'ligand binding sites'
135
-
136
- # Fill in the annotation
137
- for i in range(start - 1, end):
138
- if i < len(df):
139
  if column == 'Secondary structure':
140
- df.loc[i, column] = feature_type.upper() # Use uppercase for secondary structure
141
  else:
142
- current_value = df.loc[i, column]
143
- if current_value:
144
- df.loc[i, column] = f"{current_value}; {description}"
145
- else:
146
- df.loc[i, column] = description
 
147
 
148
  return df
149
 
@@ -170,7 +262,7 @@ def process_uniprot_id(uniprot_id):
170
 
171
 
172
  # Gradio Interface
173
- iface = gr.Interface(
174
  fn=process_uniprot_id,
175
  inputs=gr.Textbox(label="UniProt ID", placeholder="e.g., P04637"),
176
  outputs=gr.Dataframe(label="Protein Sequence and Annotations"),
@@ -178,4 +270,5 @@ iface = gr.Interface(
178
  description="Enter a UniProt ID to view the protein sequence and its annotations in a DataFrame."
179
  )
180
 
181
- iface.launch()
 
 
23
  url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
24
  response = urlopen(url).read().decode('utf-8')
25
 
26
+ # Parse XML with namespace
27
  root = ET.fromstring(response)
28
+ ns = {'up': 'http://uniprot.org/uniprot'}
29
 
30
  # Get sequence
31
+ sequence_elem = root.find(".//up:sequence", ns)
32
  if sequence_elem is None:
33
  return None, None, "Could not find sequence in UniProt response"
34
  protein_sequence = sequence_elem.text.strip()
35
 
36
  # Get feature annotations
37
  annotations = {}
38
+ for feature in root.findall(".//up:feature", ns):
39
  feature_type = feature.get('type')
40
  description = feature.get('description', '')
41
 
42
  # Get position information
43
+ location = feature.find("up:location", ns)
44
  if location is None:
45
  continue
46
+
47
+ # Handle different types of position elements
48
+ position = location.find("up:position", ns)
49
+ begin = location.find("up:begin", ns)
50
+ end_elem = location.find("up:end", ns)
51
+
52
  if position is not None:
53
+ pos = int(position.get('position'))
54
+ # For single position features
55
+ if feature_type not in annotations:
56
+ annotations[feature_type] = []
57
+ annotations[feature_type].append({
58
+ 'position': pos,
59
+ 'description': description
60
+ })
61
+ elif begin is not None and end_elem is not None:
62
  start = int(begin.get('position'))
63
  end = int(end_elem.get('position'))
64
+ # For range features and disulfide bonds
65
+ if feature_type not in annotations:
66
+ annotations[feature_type] = []
67
+ annotations[feature_type].append({
68
+ 'begin': start,
69
+ 'end': end,
70
+ 'description': description
71
+ })
72
 
73
  return protein_sequence, annotations, None
74
 
 
109
  'helix': 'Secondary structure',
110
  'turn': 'Secondary structure',
111
  'domain': 'Domain',
 
112
  'disulfide bond': 'Disulfide bridges',
113
  'glycosylation site': 'Glycosylation sites',
114
  'modified residue': 'modified',
115
  'active site': 'active sites',
 
116
  'site': 'Phosphorylation sites'
117
  }
118
 
119
+ # Special mappings that need additional processing
120
+ region_mapping = {
121
+ 'pfam': 'Pfam domain',
122
+ 'disorder': 'Disorder'
123
+ }
124
+
125
+ binding_mapping = {
126
+ 'metal': 'metal binding sites',
127
+ 'dna': 'DNA binding sites',
128
+ 'rna': 'RNA binding sites',
129
+ 'ligand': 'ligand binding sites'
130
+ }
131
+
132
  for feature_type, values in annotations.items():
133
+ feature_type = feature_type.lower()
134
+
135
+ # Handle disulfide bond pairs
136
+ if feature_type == 'disulfide bond':
137
+ for item in values:
138
+ start = item['begin']
139
+ end = item['end']
140
+ desc = f"Disulfide bridge with Cys-{end}"
141
+ df.at[start-1, 'Disulfide bridges'] = desc
142
+ desc = f"Disulfide bridge with Cys-{start}"
143
+ df.at[end-1, 'Disulfide bridges'] = desc
144
+
145
+ # Handle glycosylation sites
146
+ elif feature_type == 'glycosylation site':
147
+ for item in values:
148
+ pos = item['position'] - 1
149
+ df.at[pos, 'Glycosylation sites'] = item['description']
150
+
151
+ # Handle region features
152
+ elif feature_type == 'region':
153
+ for item in values:
154
+ start = item.get('begin', item.get('position'))
155
+ end = item.get('end', item.get('position'))
156
+ if not start:
157
+ continue
158
+
159
+ start = int(start)
160
+ end = int(end) if end else start
161
+ desc = item['description'].lower()
162
+
163
+ # Map to appropriate column based on description
164
+ column = None
165
+ if 'pfam' in desc:
166
+ column = 'Pfam domain'
167
+ elif 'disorder' in desc:
168
+ column = 'Disorder'
169
+
170
+ if column:
171
+ for i in range(start - 1, end):
172
+ if i >= len(df):
173
+ continue
174
+ current = df.at[i, column]
175
+ if isinstance(current, str) and current != "" and desc:
176
+ df.at[i, column] = f"{current}; {desc}"
177
+ elif desc:
178
+ df.at[i, column] = desc
179
+
180
+ # Handle binding site features
181
+ elif feature_type == 'binding site':
182
+ for item in values:
183
+ start = item.get('begin', item.get('position'))
184
+ end = item.get('end', item.get('position'))
185
+ if not start:
186
+ continue
187
+
188
+ start = int(start)
189
+ end = int(end) if end else start
190
+ desc = item['description'].lower()
191
+
192
+ # Map to appropriate column based on description
193
+ column = None
194
+ if 'metal' in desc:
195
+ column = 'metal binding sites'
196
+ elif 'dna' in desc:
197
+ column = 'DNA binding sites'
198
+ elif 'rna' in desc:
199
+ column = 'RNA binding sites'
200
+ else:
201
+ column = 'ligand binding sites'
202
+
203
+ for i in range(start - 1, end):
204
+ if i >= len(df):
205
+ continue
206
+ current = df.at[i, column]
207
+ if isinstance(current, str) and current != "" and desc:
208
+ df.at[i, column] = f"{current}; {desc}"
209
+ elif desc:
210
+ df.at[i, column] = desc
211
+
212
+ # Handle other features
213
+ else:
214
  column = feature_mapping.get(feature_type)
215
  if not column:
216
  continue
217
 
218
+ for item in values:
219
+ start = item.get('begin', item.get('position'))
220
+ end = item.get('end', item.get('position'))
221
+ if not start:
222
+ continue
223
+
224
+ start = int(start)
225
+ end = int(end) if end else start
226
+
227
+ for i in range(start - 1, end):
228
+ if i >= len(df):
229
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  if column == 'Secondary structure':
231
+ df.at[i, column] = feature_type.upper()
232
  else:
233
+ current = df.at[i, column]
234
+ desc = item['description']
235
+ if isinstance(current, str) and current != "" and desc:
236
+ df.at[i, column] = f"{current}; {desc}"
237
+ elif desc:
238
+ df.at[i, column] = desc
239
 
240
  return df
241
 
 
262
 
263
 
264
  # Gradio Interface
265
+ demo = gr.Interface(
266
  fn=process_uniprot_id,
267
  inputs=gr.Textbox(label="UniProt ID", placeholder="e.g., P04637"),
268
  outputs=gr.Dataframe(label="Protein Sequence and Annotations"),
 
270
  description="Enter a UniProt ID to view the protein sequence and its annotations in a DataFrame."
271
  )
272
 
273
+ if __name__ == "__main__":
274
+ demo.launch()