Spaces:
Sleeping
Sleeping
fixes
Browse files- README.txt +0 -43
- app.py +54 -61
- test/P01308.xml +0 -0
- test/P07550.xml +0 -0
README.txt
DELETED
|
@@ -1,43 +0,0 @@
|
|
| 1 |
-
# Protein Sequence Table
|
| 2 |
-
|
| 3 |
-
A Gradio-based web application that reformats protein sequences based on UniProt IDs and displays detailed annotations in a structured format.
|
| 4 |
-
|
| 5 |
-
## Features
|
| 6 |
-
|
| 7 |
-
The application retrieves protein data from UniProt and presents the following information for each residue:
|
| 8 |
-
- Position number in the sequence
|
| 9 |
-
- Amino acid (single-letter code)
|
| 10 |
-
- Secondary structure annotation
|
| 11 |
-
- Associated Pfam domain
|
| 12 |
-
- Disorder prediction
|
| 13 |
-
- Participation in disulfide bridges
|
| 14 |
-
- Post-translational modifications:
|
| 15 |
-
* Glycosylation sites
|
| 16 |
-
* Phosphorylation sites
|
| 17 |
-
- Functional annotations:
|
| 18 |
-
* Active sites
|
| 19 |
-
* Metal binding sites
|
| 20 |
-
* DNA binding regions
|
| 21 |
-
* RNA binding regions
|
| 22 |
-
* Ligand binding sites
|
| 23 |
-
* Other modifications
|
| 24 |
-
|
| 25 |
-
## Usage
|
| 26 |
-
|
| 27 |
-
1. Launch the application
|
| 28 |
-
2. Enter a valid UniProt accession or entry name (e.g., P04637 or P53_HUMAN) in the input field
|
| 29 |
-
3. Click "Submit" to generate the analysis
|
| 30 |
-
4. Results will be displayed in an interactive data frame format
|
| 31 |
-
|
| 32 |
-
## Requirements
|
| 33 |
-
|
| 34 |
-
- Python 3.7+
|
| 35 |
-
- Gradio
|
| 36 |
-
- Pandas
|
| 37 |
-
- Requests
|
| 38 |
-
- XML parsing libraries
|
| 39 |
-
|
| 40 |
-
## Note
|
| 41 |
-
|
| 42 |
-
The application processes UniProt's XML format to extract annotations.
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -18,62 +18,58 @@ def get_uniprot_data(uniprot_id):
|
|
| 18 |
- annotations: A dictionary containing annotations.
|
| 19 |
- error_message: An error message if something goes wrong, otherwise None
|
| 20 |
"""
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
#
|
| 27 |
-
|
| 28 |
-
|
|
|
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
protein_sequence = sequence_elem.text.strip()
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
feature_type
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
annotations[feature_type] = []
|
| 57 |
-
annotations[feature_type].append({
|
| 58 |
-
'position': pos,
|
| 59 |
-
'description': description
|
| 60 |
-
})
|
| 61 |
-
elif begin is not None and end_elem is not None:
|
| 62 |
-
start = int(begin.get('position'))
|
| 63 |
-
end = int(end_elem.get('position'))
|
| 64 |
-
# For range features and disulfide bonds
|
| 65 |
-
if feature_type not in annotations:
|
| 66 |
-
annotations[feature_type] = []
|
| 67 |
-
annotations[feature_type].append({
|
| 68 |
-
'begin': start,
|
| 69 |
-
'end': end,
|
| 70 |
-
'description': description
|
| 71 |
-
})
|
| 72 |
-
|
| 73 |
-
return protein_sequence, annotations, None
|
| 74 |
|
| 75 |
-
|
| 76 |
-
return None, None, f"Error fetching or processing data from UniProt: {e}"
|
| 77 |
|
| 78 |
def create_dataframe(protein_sequence, annotations):
|
| 79 |
"""
|
|
@@ -127,9 +123,9 @@ def create_dataframe(protein_sequence, annotations):
|
|
| 127 |
for item in values:
|
| 128 |
start = item['begin']
|
| 129 |
end = item['end']
|
| 130 |
-
desc = f"
|
| 131 |
df.at[start-1, 'Disulfide bridges'] = desc
|
| 132 |
-
desc = f"
|
| 133 |
df.at[end-1, 'Disulfide bridges'] = desc
|
| 134 |
|
| 135 |
# Handle glycosylation sites
|
|
@@ -228,10 +224,7 @@ def process_uniprot_id(uniprot_id):
|
|
| 228 |
Returns:
|
| 229 |
A Pandas DataFrame or an error message.
|
| 230 |
"""
|
| 231 |
-
protein_sequence, annotations
|
| 232 |
-
|
| 233 |
-
if error_message:
|
| 234 |
-
return error_message
|
| 235 |
|
| 236 |
if protein_sequence and annotations:
|
| 237 |
df = create_dataframe(protein_sequence, annotations)
|
|
@@ -255,17 +248,17 @@ with gr.Blocks() as demo:
|
|
| 255 |
gr.Examples(
|
| 256 |
examples=[
|
| 257 |
["P06280"], # Alpha-galactosidase A
|
| 258 |
-
["
|
| 259 |
["P01308"], # Insulin
|
| 260 |
["Q8WZ42"], # Titin
|
| 261 |
-
["P04637"], # p53 (alternate entry)
|
| 262 |
["P0DTC2"], # SARS-CoV-2 Spike protein
|
| 263 |
],
|
|
|
|
| 264 |
inputs=input_text,
|
| 265 |
label="Example UniProt IDs"
|
| 266 |
)
|
| 267 |
|
| 268 |
-
output_df = gr.Dataframe()
|
| 269 |
|
| 270 |
submit_btn.click(
|
| 271 |
fn=process_uniprot_id,
|
|
|
|
| 18 |
- annotations: A dictionary containing annotations.
|
| 19 |
- error_message: An error message if something goes wrong, otherwise None
|
| 20 |
"""
|
| 21 |
+
# Fetch XML data
|
| 22 |
+
url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
|
| 23 |
+
response = urlopen(url).read().decode('utf-8')
|
| 24 |
+
|
| 25 |
+
# Parse XML with namespace
|
| 26 |
+
root = ET.fromstring(response)
|
| 27 |
+
ns = {'up': 'http://uniprot.org/uniprot'}
|
| 28 |
+
|
| 29 |
+
# Get sequence
|
| 30 |
+
sequence_elem = root.find("./up:entry/up:sequence", ns)
|
| 31 |
+
if sequence_elem is None:
|
| 32 |
+
return None, None, "Could not find sequence in UniProt response"
|
| 33 |
+
protein_sequence = sequence_elem.text.strip()
|
| 34 |
+
|
| 35 |
+
# Get feature annotations
|
| 36 |
+
annotations = {}
|
| 37 |
+
for feature in root.findall(".//up:feature", ns):
|
| 38 |
+
feature_type = feature.get('type')
|
| 39 |
+
description = feature.get('description', '')
|
| 40 |
|
| 41 |
+
# Get position information
|
| 42 |
+
location = feature.find("up:location", ns)
|
| 43 |
+
if location is None:
|
| 44 |
+
continue
|
| 45 |
|
| 46 |
+
# Handle different types of position elements
|
| 47 |
+
position = location.find("up:position", ns)
|
| 48 |
+
begin = location.find("up:begin", ns)
|
| 49 |
+
end_elem = location.find("up:end", ns)
|
|
|
|
| 50 |
|
| 51 |
+
if position is not None:
|
| 52 |
+
pos = int(position.get('position'))
|
| 53 |
+
# For single position features
|
| 54 |
+
if feature_type not in annotations:
|
| 55 |
+
annotations[feature_type] = []
|
| 56 |
+
annotations[feature_type].append({
|
| 57 |
+
'position': pos,
|
| 58 |
+
'description': description
|
| 59 |
+
})
|
| 60 |
+
elif begin is not None and end_elem is not None:
|
| 61 |
+
start = int(begin.get('position'))
|
| 62 |
+
end = int(end_elem.get('position'))
|
| 63 |
+
# For range features and disulfide bonds
|
| 64 |
+
if feature_type not in annotations:
|
| 65 |
+
annotations[feature_type] = []
|
| 66 |
+
annotations[feature_type].append({
|
| 67 |
+
'begin': start,
|
| 68 |
+
'end': end,
|
| 69 |
+
'description': description
|
| 70 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
+
return protein_sequence, annotations
|
|
|
|
| 73 |
|
| 74 |
def create_dataframe(protein_sequence, annotations):
|
| 75 |
"""
|
|
|
|
| 123 |
for item in values:
|
| 124 |
start = item['begin']
|
| 125 |
end = item['end']
|
| 126 |
+
desc = f"Cys-{end}"
|
| 127 |
df.at[start-1, 'Disulfide bridges'] = desc
|
| 128 |
+
desc = f"Cys-{start}"
|
| 129 |
df.at[end-1, 'Disulfide bridges'] = desc
|
| 130 |
|
| 131 |
# Handle glycosylation sites
|
|
|
|
| 224 |
Returns:
|
| 225 |
A Pandas DataFrame or an error message.
|
| 226 |
"""
|
| 227 |
+
protein_sequence, annotations = get_uniprot_data(uniprot_id)
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
if protein_sequence and annotations:
|
| 230 |
df = create_dataframe(protein_sequence, annotations)
|
|
|
|
| 248 |
gr.Examples(
|
| 249 |
examples=[
|
| 250 |
["P06280"], # Alpha-galactosidase A
|
| 251 |
+
["P07550"], # beta-2 AR
|
| 252 |
["P01308"], # Insulin
|
| 253 |
["Q8WZ42"], # Titin
|
|
|
|
| 254 |
["P0DTC2"], # SARS-CoV-2 Spike protein
|
| 255 |
],
|
| 256 |
+
example_labels=["Alpha-galactosidase A", "Beta-2 adrenergic receptor", "Insulin", "Titin", "SARS-CoV-2 Spike protein"],
|
| 257 |
inputs=input_text,
|
| 258 |
label="Example UniProt IDs"
|
| 259 |
)
|
| 260 |
|
| 261 |
+
output_df = gr.Dataframe(interactive=False)
|
| 262 |
|
| 263 |
submit_btn.click(
|
| 264 |
fn=process_uniprot_id,
|
test/P01308.xml
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
test/P07550.xml
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|