tonigi committed on
Commit
cb99418
·
1 Parent(s): 74f6990
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +4 -214
  3. uniprot_data.py +216 -0
.gitignore CHANGED
@@ -1 +1,3 @@
1
  env/
 
 
 
1
  env/
2
+ __pycache__/
3
+ *.pyc
app.py CHANGED
@@ -1,220 +1,7 @@
1
  import gradio as gr
2
- import pandas as pd
3
  from io import StringIO
4
- from urllib.request import urlopen
5
- import re
6
- import xml.etree.ElementTree as ET
7
 
8
-
9
- def get_uniprot_data(uniprot_id):
10
- """
11
- Fetches protein sequence and annotation data from UniProt in XML format.
12
-
13
- Args:
14
- uniprot_id: The UniProt ID of the protein.
15
-
16
- Returns:
17
- A tuple containing:
18
- - protein_sequence: The protein sequence as a string.
19
- - annotations: A dictionary containing annotations.
20
- - error_message: An error message if something goes wrong, otherwise None
21
- """
22
- # Fetch XML data
23
- local_file_path = os.path.join("test", f"{uniprot_id}.xml")
24
- if os.path.exists(local_file_path):
25
- with open(local_file_path, "r", encoding="utf-8") as file:
26
- response = file.read()
27
- else:
28
- # Fetch XML data from UniProt
29
- url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
30
- response = urlopen(url).read().decode("utf-8")
31
-
32
- # Parse XML with namespace
33
- root = ET.fromstring(response)
34
- ns = {"up": "http://uniprot.org/uniprot"}
35
-
36
- # Get sequence
37
- sequence_elem = root.find("./up:entry/up:sequence", ns)
38
- if sequence_elem is None:
39
- return None, None, "Could not find sequence in UniProt response"
40
- protein_sequence = sequence_elem.text.strip()
41
-
42
- # Get feature annotations
43
- annotations = {}
44
- for feature in root.findall(".//up:feature", ns):
45
- feature_type = feature.get("type")
46
- description = feature.get("description", "")
47
-
48
- # Get position information
49
- location = feature.find("up:location", ns)
50
- if location is None:
51
- continue
52
-
53
- # Handle different types of position elements
54
- position = location.find("up:position", ns)
55
- begin = location.find("up:begin", ns)
56
- end_elem = location.find("up:end", ns)
57
-
58
- if position is not None:
59
- pos = int(position.get("position"))
60
- # For single position features
61
- if feature_type not in annotations:
62
- annotations[feature_type] = []
63
- annotations[feature_type].append(
64
- {"position": pos, "description": description}
65
- )
66
- elif begin is not None and end_elem is not None:
67
- start = int(begin.get("position"))
68
- end = int(end_elem.get("position"))
69
- # For range features and disulfide bonds
70
- if feature_type not in annotations:
71
- annotations[feature_type] = []
72
- annotations[feature_type].append(
73
- {"begin": start, "end": end, "description": description}
74
- )
75
-
76
- return protein_sequence, annotations
77
-
78
-
79
- def create_dataframe(protein_sequence, annotations):
80
- """
81
- Creates a Pandas DataFrame from protein sequence and annotations.
82
- """
83
- data = []
84
- for i, residue in enumerate(protein_sequence):
85
- residue_number = i + 1
86
- row = {
87
- "Residue Number": residue_number,
88
- "Residue code": residue,
89
- "Secondary structure": "",
90
- "Domain": "",
91
- "Pfam domain": "",
92
- "Disorder": "",
93
- "Disulfide bridges": "",
94
- "Glycosylation sites": "",
95
- "Phosphorylation sites": "",
96
- "active sites": "",
97
- "Binding sites": "", # Combined binding sites column
98
- "modified": "",
99
- }
100
- data.append(row)
101
-
102
- df = pd.DataFrame(data)
103
-
104
- # Map UniProt feature types to our column names
105
- feature_mapping = {
106
- "strand": "Secondary structure",
107
- "helix": "Secondary structure",
108
- "turn": "Secondary structure",
109
- "domain": "Domain",
110
- "disulfide bond": "Disulfide bridges",
111
- "glycosylation site": "Glycosylation sites",
112
- "modified residue": "modified",
113
- "active site": "active sites",
114
- "site": "Phosphorylation sites",
115
- }
116
-
117
- # Special mappings that need additional processing
118
- region_mapping = {"pfam": "Pfam domain", "disorder": "Disorder"}
119
-
120
- for feature_type, values in annotations.items():
121
- feature_type = feature_type.lower()
122
-
123
- # Handle disulfide bond pairs
124
- if feature_type == "disulfide bond":
125
- for item in values:
126
- start = item["begin"]
127
- end = item["end"]
128
- desc = f"Cys-{end}"
129
- df.at[start - 1, "Disulfide bridges"] = desc
130
- desc = f"Cys-{start}"
131
- df.at[end - 1, "Disulfide bridges"] = desc
132
-
133
- # Handle glycosylation sites
134
- elif feature_type == "glycosylation site":
135
- for item in values:
136
- pos = item["position"] - 1
137
- df.at[pos, "Glycosylation sites"] = item["description"]
138
-
139
- # Handle region features
140
- elif feature_type == "region":
141
- for item in values:
142
- start = item.get("begin", item.get("position"))
143
- end = item.get("end", item.get("position"))
144
- if not start:
145
- continue
146
-
147
- start = int(start)
148
- end = int(end) if end else start
149
- desc = item["description"].lower()
150
-
151
- # Map to appropriate column based on description
152
- column = None
153
- if "pfam" in desc:
154
- column = "Pfam domain"
155
- elif "disorder" in desc:
156
- column = "Disorder"
157
-
158
- if column:
159
- for i in range(start - 1, end):
160
- if i >= len(df):
161
- continue
162
- current = df.at[i, column]
163
- if isinstance(current, str) and current != "" and desc:
164
- df.at[i, column] = f"{current}; {desc}"
165
- elif desc:
166
- df.at[i, column] = desc
167
-
168
- # Handle binding site features
169
- elif feature_type == "binding site":
170
- for item in values:
171
- start = item.get("begin", item.get("position"))
172
- end = item.get("end", item.get("position"))
173
- if not start:
174
- continue
175
-
176
- start = int(start)
177
- end = int(end) if end else start
178
- desc = item["description"]
179
-
180
- for i in range(start - 1, end):
181
- if i >= len(df):
182
- continue
183
- current = df.at[i, "Binding sites"]
184
- if isinstance(current, str) and current != "" and desc:
185
- df.at[i, "Binding sites"] = f"{current}; {desc}"
186
- elif desc:
187
- df.at[i, "Binding sites"] = desc
188
-
189
- # Handle other features
190
- else:
191
- column = feature_mapping.get(feature_type)
192
- if not column:
193
- continue
194
-
195
- for item in values:
196
- start = item.get("begin", item.get("position"))
197
- end = item.get("end", item.get("position"))
198
- if not start:
199
- continue
200
-
201
- start = int(start)
202
- end = int(end) if end else start
203
-
204
- for i in range(start - 1, end):
205
- if i >= len(df):
206
- continue
207
- if column == "Secondary structure":
208
- df.at[i, column] = feature_type.upper()
209
- else:
210
- current = df.at[i, column]
211
- desc = item["description"]
212
- if isinstance(current, str) and current != "" and desc:
213
- df.at[i, column] = f"{current}; {desc}"
214
- elif desc:
215
- df.at[i, column] = desc
216
-
217
- return df
218
 
219
 
220
  def process_uniprot_id(uniprot_id):
@@ -240,6 +27,9 @@ def process_uniprot_id(uniprot_id):
240
  with gr.Blocks() as demo:
241
  with gr.Column():
242
  gr.Markdown("# Protein Sequence Analysis")
 
 
 
243
  input_text = gr.Textbox(
244
  label="UniProt ID",
245
  placeholder="Enter UniProt ID (e.g., P53_HUMAN)",
 
1
  import gradio as gr
 
2
  from io import StringIO
 
 
 
3
 
4
+ from uniprot_data import create_dataframe, get_uniprot_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
 
7
  def process_uniprot_id(uniprot_id):
 
27
  with gr.Blocks() as demo:
28
  with gr.Column():
29
  gr.Markdown("# Protein Sequence Analysis")
30
+ gr.Markdown(
31
+ "This app fetches protein sequence and annotation data from UniProt using a UniProt ID and prints a copy-pasteable table for note-taking. **DO NOT TRUST, THIS IS A CODING EXPERIMENT**"
32
+ )
33
  input_text = gr.Textbox(
34
  label="UniProt ID",
35
  placeholder="Enter UniProt ID (e.g., P53_HUMAN)",
uniprot_data.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import xml.etree.ElementTree as ET
3
+ from urllib.request import urlopen
4
+
5
+ import pandas as pd
6
+
7
+
8
def get_uniprot_data(uniprot_id):
    """
    Fetches protein sequence and annotation data from UniProt in XML format.

    A local copy at ``test/<uniprot_id>.xml`` (relative to the current working
    directory) is used when it exists; otherwise the entry is downloaded from
    uniprot.org.

    Args:
        uniprot_id: The UniProt ID of the protein.

    Returns:
        On success, a 2-tuple ``(protein_sequence, annotations)``:
        - protein_sequence: The protein sequence as a string.
        - annotations: A dictionary mapping each feature type to a list of
          dicts. Every dict has a "description" key plus either "position"
          (single-residue features) or "begin" and "end" (range features).
          Features without a location, or whose coordinates are missing or
          non-numeric, are skipped.

        When the entry has no sequence, a 3-tuple
        ``(None, None, error_message)``.

        NOTE(review): the success and error results differ in length. This
        is kept as-is because the caller is not visible from this module;
        confirm how the caller unpacks the result before unifying them.
    """
    # Prefer a local fixture so the function can be exercised offline.
    local_file_path = os.path.join("test", f"{uniprot_id}.xml")
    if os.path.exists(local_file_path):
        with open(local_file_path, "r", encoding="utf-8") as file:
            xml_text = file.read()
    else:
        # Fetch XML data from UniProt; the context manager closes the
        # connection even if reading or decoding fails.
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
        with urlopen(url) as response:
            xml_text = response.read().decode("utf-8")

    # Parse XML with namespace
    root = ET.fromstring(xml_text)
    ns = {"up": "http://uniprot.org/uniprot"}

    # Get sequence (an empty <sequence/> element has text None)
    sequence_elem = root.find("./up:entry/up:sequence", ns)
    if sequence_elem is None or sequence_elem.text is None:
        return None, None, "Could not find sequence in UniProt response"
    protein_sequence = sequence_elem.text.strip()

    # Get feature annotations
    annotations = {}
    for feature in root.findall(".//up:feature", ns):
        location = feature.find("up:location", ns)
        if location is None:
            continue

        description = feature.get("description", "")

        # A <position> child marks a single-residue feature; otherwise a
        # <begin>/<end> pair marks a range (also used for disulfide bonds).
        position = location.find("up:position", ns)
        if position is not None:
            pos = _feature_position(position)
            if pos is None:
                continue
            entry = {"position": pos, "description": description}
        else:
            start = _feature_position(location.find("up:begin", ns))
            end = _feature_position(location.find("up:end", ns))
            if start is None or end is None:
                continue
            entry = {"begin": start, "end": end, "description": description}

        annotations.setdefault(feature.get("type"), []).append(entry)

    return protein_sequence, annotations


def _feature_position(element):
    """Return the integer ``position`` attribute of *element*, or None.

    None is returned when the element is missing or when the attribute is
    absent or non-numeric (e.g. a location that only carries a ``status``).
    """
    if element is None:
        return None
    try:
        return int(element.get("position"))
    except (TypeError, ValueError):
        return None
76
+
77
+
78
def create_dataframe(protein_sequence, annotations):
    """
    Creates a Pandas DataFrame from protein sequence and annotations.

    The frame has one row per residue, with "Residue Number" (1-based),
    "Residue code", and one text column per annotation category. Cells of
    residues without an annotation hold the empty string.

    Args:
        protein_sequence: The protein sequence; each character becomes a row.
        annotations: A dictionary mapping a feature type to a list of dicts.
            Each dict carries "description" and either "position" or
            "begin"/"end", all 1-based residue numbers.

    Returns:
        A DataFrame with ``len(protein_sequence)`` rows. Coordinates that
        fall outside the sequence are ignored rather than adding rows.
    """
    length = len(protein_sequence)

    # Columns are built as plain lists and turned into a frame once at the
    # end, so an out-of-range coordinate can never enlarge the table.
    columns = {
        "Residue Number": list(range(1, length + 1)),
        "Residue code": list(protein_sequence),
    }
    for name in (
        "Secondary structure",
        "Domain",
        "Pfam domain",
        "Disorder",
        "Disulfide bridges",
        "Glycosylation sites",
        "Phosphorylation sites",
        "active sites",
        "Binding sites",  # Combined binding sites column
        "modified",
    ):
        columns[name] = [""] * length

    # Map UniProt feature types to our column names. Disulfide bonds,
    # glycosylation sites, regions and binding sites have dedicated branches.
    feature_mapping = {
        "strand": "Secondary structure",
        "helix": "Secondary structure",
        "turn": "Secondary structure",
        "domain": "Domain",
        "modified residue": "modified",
        "active site": "active sites",
        "site": "Phosphorylation sites",
    }

    for raw_type, values in annotations.items():
        feature_type = (raw_type or "").lower()

        # Handle disulfide bond pairs: each partner is labelled with the
        # residue number of the other one.
        if feature_type == "disulfide bond":
            bridges = columns["Disulfide bridges"]
            for item in values:
                if "begin" in item and "end" in item:
                    start, end = item["begin"], item["end"]
                    for here, partner in ((start, end), (end, start)):
                        if 1 <= here <= length:
                            bridges[here - 1] = f"Cys-{partner}"
                else:
                    # Single-position bond (no partner in this chain): keep
                    # its description instead of failing on a missing key.
                    desc = item.get("description", "")
                    for i in _residue_span(item, length):
                        if desc:
                            bridges[i] = desc

        # Handle glycosylation sites (description overwrites the cell)
        elif feature_type == "glycosylation site":
            for item in values:
                for i in _residue_span(item, length):
                    columns["Glycosylation sites"][i] = item.get("description", "")

        # Handle region features, routed by keywords in the description
        elif feature_type == "region":
            for item in values:
                desc = item.get("description", "").lower()
                if "pfam" in desc:
                    column = "Pfam domain"
                elif "disorder" in desc:
                    column = "Disorder"
                else:
                    continue
                for i in _residue_span(item, length):
                    _append_note(columns[column], i, desc)

        # Handle binding site features
        elif feature_type == "binding site":
            for item in values:
                desc = item.get("description", "")
                for i in _residue_span(item, length):
                    _append_note(columns["Binding sites"], i, desc)

        # Handle other features
        else:
            column = feature_mapping.get(feature_type)
            if not column:
                continue
            for item in values:
                desc = item.get("description", "")
                for i in _residue_span(item, length):
                    if column == "Secondary structure":
                        # Secondary structure shows the element kind itself.
                        columns[column][i] = feature_type.upper()
                    else:
                        _append_note(columns[column], i, desc)

    return pd.DataFrame(columns)


def _residue_span(item, length):
    """Return the 0-based row indices covered by *item*, clipped to the sequence."""
    start = item.get("begin", item.get("position"))
    end = item.get("end", item.get("position"))
    if not start:
        return range(0)
    start = int(start)
    end = int(end) if end else start
    return range(max(start - 1, 0), min(end, length))


def _append_note(cells, index, text):
    """Add *text* to ``cells[index]``, joining with "; " if the cell is already filled."""
    if not text:
        return
    current = cells[index]
    cells[index] = f"{current}; {text}" if current else text