tonigi committed on
Commit
2b158eb
·
1 Parent(s): 1fb077a
Files changed (4) hide show
  1. README.txt +0 -43
  2. app.py +54 -61
  3. test/P01308.xml +0 -0
  4. test/P07550.xml +0 -0
README.txt DELETED
@@ -1,43 +0,0 @@
1
- # Protein Sequence Table
2
-
3
- A Gradio-based web application that reformats protein sequences based on UniProt IDs and displays detailed annotations in a structured format.
4
-
5
- ## Features
6
-
7
- The application retrieves protein data from UniProt and presents the following information for each residue:
8
- - Position number in the sequence
9
- - Amino acid (single-letter code)
10
- - Secondary structure annotation
11
- - Associated Pfam domain
12
- - Disorder prediction
13
- - Participation in disulfide bridges
14
- - Post-translational modifications:
15
- * Glycosylation sites
16
- * Phosphorylation sites
17
- - Functional annotations:
18
- * Active sites
19
- * Metal binding sites
20
- * DNA binding regions
21
- * RNA binding regions
22
- * Ligand binding sites
23
- * Other modifications
24
-
25
- ## Usage
26
-
27
- 1. Launch the application
28
- 2. Enter a valid UniProt ID (e.g., P53_HUMAN) in the input field
29
- 3. Click "Submit" to generate the analysis
30
- 4. Results will be displayed in an interactive data frame format
31
-
32
- ## Requirements
33
-
34
- - Python 3.7+
35
- - Gradio
36
- - Pandas
37
- - Requests
38
- - XML parsing libraries
39
-
40
- ## Note
41
-
42
- The application processes UniProt's XML format to extract annotations.
43
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -18,62 +18,58 @@ def get_uniprot_data(uniprot_id):
18
  - annotations: A dictionary containing annotations.
19
  - error_message: An error message if something goes wrong, otherwise None
20
  """
21
- try:
22
- # Fetch XML data
23
- url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
24
- response = urlopen(url).read().decode('utf-8')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- # Parse XML with namespace
27
- root = ET.fromstring(response)
28
- ns = {'up': 'http://uniprot.org/uniprot'}
 
29
 
30
- # Get sequence
31
- sequence_elem = root.find(".//up:sequence", ns)
32
- if sequence_elem is None:
33
- return None, None, "Could not find sequence in UniProt response"
34
- protein_sequence = sequence_elem.text.strip()
35
 
36
- # Get feature annotations
37
- annotations = {}
38
- for feature in root.findall(".//up:feature", ns):
39
- feature_type = feature.get('type')
40
- description = feature.get('description', '')
41
-
42
- # Get position information
43
- location = feature.find("up:location", ns)
44
- if location is None:
45
- continue
46
-
47
- # Handle different types of position elements
48
- position = location.find("up:position", ns)
49
- begin = location.find("up:begin", ns)
50
- end_elem = location.find("up:end", ns)
51
-
52
- if position is not None:
53
- pos = int(position.get('position'))
54
- # For single position features
55
- if feature_type not in annotations:
56
- annotations[feature_type] = []
57
- annotations[feature_type].append({
58
- 'position': pos,
59
- 'description': description
60
- })
61
- elif begin is not None and end_elem is not None:
62
- start = int(begin.get('position'))
63
- end = int(end_elem.get('position'))
64
- # For range features and disulfide bonds
65
- if feature_type not in annotations:
66
- annotations[feature_type] = []
67
- annotations[feature_type].append({
68
- 'begin': start,
69
- 'end': end,
70
- 'description': description
71
- })
72
-
73
- return protein_sequence, annotations, None
74
 
75
- except Exception as e:
76
- return None, None, f"Error fetching or processing data from UniProt: {e}"
77
 
78
  def create_dataframe(protein_sequence, annotations):
79
  """
@@ -127,9 +123,9 @@ def create_dataframe(protein_sequence, annotations):
127
  for item in values:
128
  start = item['begin']
129
  end = item['end']
130
- desc = f"Disulfide bridge with Cys-{end}"
131
  df.at[start-1, 'Disulfide bridges'] = desc
132
- desc = f"Disulfide bridge with Cys-{start}"
133
  df.at[end-1, 'Disulfide bridges'] = desc
134
 
135
  # Handle glycosylation sites
@@ -228,10 +224,7 @@ def process_uniprot_id(uniprot_id):
228
  Returns:
229
  A Pandas DataFrame or an error message.
230
  """
231
- protein_sequence, annotations, error_message = get_uniprot_data(uniprot_id)
232
-
233
- if error_message:
234
- return error_message
235
 
236
  if protein_sequence and annotations:
237
  df = create_dataframe(protein_sequence, annotations)
@@ -255,17 +248,17 @@ with gr.Blocks() as demo:
255
  gr.Examples(
256
  examples=[
257
  ["P06280"], # Alpha-galactosidase A
258
- ["P04637"], # Tumor protein p53
259
  ["P01308"], # Insulin
260
  ["Q8WZ42"], # Titin
261
- ["P04637"], # p53 (alternate entry)
262
  ["P0DTC2"], # SARS-CoV-2 Spike protein
263
  ],
 
264
  inputs=input_text,
265
  label="Example UniProt IDs"
266
  )
267
 
268
- output_df = gr.Dataframe()
269
 
270
  submit_btn.click(
271
  fn=process_uniprot_id,
 
18
  - annotations: A dictionary containing annotations.
19
  - error_message: An error message if something goes wrong, otherwise None
20
  """
21
+ # Fetch XML data
22
+ url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
23
+ response = urlopen(url).read().decode('utf-8')
24
+
25
+ # Parse XML with namespace
26
+ root = ET.fromstring(response)
27
+ ns = {'up': 'http://uniprot.org/uniprot'}
28
+
29
+ # Get sequence
30
+ sequence_elem = root.find("./up:entry/up:sequence", ns)
31
+ if sequence_elem is None:
32
+ return None, None, "Could not find sequence in UniProt response"
33
+ protein_sequence = sequence_elem.text.strip()
34
+
35
+ # Get feature annotations
36
+ annotations = {}
37
+ for feature in root.findall(".//up:feature", ns):
38
+ feature_type = feature.get('type')
39
+ description = feature.get('description', '')
40
 
41
+ # Get position information
42
+ location = feature.find("up:location", ns)
43
+ if location is None:
44
+ continue
45
 
46
+ # Handle different types of position elements
47
+ position = location.find("up:position", ns)
48
+ begin = location.find("up:begin", ns)
49
+ end_elem = location.find("up:end", ns)
 
50
 
51
+ if position is not None:
52
+ pos = int(position.get('position'))
53
+ # For single position features
54
+ if feature_type not in annotations:
55
+ annotations[feature_type] = []
56
+ annotations[feature_type].append({
57
+ 'position': pos,
58
+ 'description': description
59
+ })
60
+ elif begin is not None and end_elem is not None:
61
+ start = int(begin.get('position'))
62
+ end = int(end_elem.get('position'))
63
+ # For range features and disulfide bonds
64
+ if feature_type not in annotations:
65
+ annotations[feature_type] = []
66
+ annotations[feature_type].append({
67
+ 'begin': start,
68
+ 'end': end,
69
+ 'description': description
70
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
+ return protein_sequence, annotations
 
73
 
74
  def create_dataframe(protein_sequence, annotations):
75
  """
 
123
  for item in values:
124
  start = item['begin']
125
  end = item['end']
126
+ desc = f"Cys-{end}"
127
  df.at[start-1, 'Disulfide bridges'] = desc
128
+ desc = f"Cys-{start}"
129
  df.at[end-1, 'Disulfide bridges'] = desc
130
 
131
  # Handle glycosylation sites
 
224
  Returns:
225
  A Pandas DataFrame or an error message.
226
  """
227
+ protein_sequence, annotations = get_uniprot_data(uniprot_id)
 
 
 
228
 
229
  if protein_sequence and annotations:
230
  df = create_dataframe(protein_sequence, annotations)
 
248
  gr.Examples(
249
  examples=[
250
  ["P06280"], # Alpha-galactosidase A
251
+ ["P07550"], # beta-2 AR
252
  ["P01308"], # Insulin
253
  ["Q8WZ42"], # Titin
 
254
  ["P0DTC2"], # SARS-CoV-2 Spike protein
255
  ],
256
+ example_labels=["Alpha-galactosidase A", "Beta-2 adrenergic receptor", "Insulin", "Titin", "SARS-CoV-2 Spike protein"],
257
  inputs=input_text,
258
  label="Example UniProt IDs"
259
  )
260
 
261
+ output_df = gr.Dataframe(interactive=False)
262
 
263
  submit_btn.click(
264
  fn=process_uniprot_id,
test/P01308.xml ADDED
The diff for this file is too large to render. See raw diff
 
test/P07550.xml ADDED
The diff for this file is too large to render. See raw diff