Spaces:
Sleeping
Sleeping
File size: 7,729 Bytes
cb99418 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
import os
import xml.etree.ElementTree as ET
from urllib.request import urlopen
import pandas as pd
def _read_position(element):
    """Return the integer ``position`` attribute of *element*, or None if it is absent."""
    if element is None:
        return None
    raw = element.get("position")
    # Location elements may omit the attribute entirely
    # (e.g. ``<begin status="unknown"/>``); report that as "no position".
    return None if raw is None else int(raw)


def get_uniprot_data(uniprot_id):
    """
    Fetch a protein's sequence and feature annotations from UniProt XML.

    A local copy at ``test/<uniprot_id>.xml`` is used when it exists;
    otherwise the entry is downloaded from uniprot.org.

    Args:
        uniprot_id: The UniProt ID of the protein.

    Returns:
        On success, a 2-tuple ``(protein_sequence, annotations)``:
            - protein_sequence: The protein sequence as a string.
            - annotations: A dictionary mapping each feature type to a list
              of dicts. Single-position features carry ``position`` and
              ``description``; range features carry ``begin``, ``end`` and
              ``description``.
        When the entry has no sequence, a 3-tuple
        ``(None, None, error_message)``.

    NOTE(review): the success and error results differ in length. The shapes
    are kept as they were so existing callers keep working; confirm how
    callers unpack the result before unifying them.
    """
    # Prefer a cached copy on disk over a network round trip.
    local_file_path = os.path.join("test", f"{uniprot_id}.xml")
    if os.path.exists(local_file_path):
        with open(local_file_path, "r", encoding="utf-8") as file:
            response = file.read()
    else:
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
        # The context manager closes the HTTP response even if decoding fails.
        with urlopen(url) as reply:
            response = reply.read().decode("utf-8")

    # UniProt XML puts every element in this default namespace.
    root = ET.fromstring(response)
    ns = {"up": "http://uniprot.org/uniprot"}

    sequence_elem = root.find("./up:entry/up:sequence", ns)
    # An empty ``<sequence/>`` element has ``text`` of None; treat it like a
    # missing element instead of failing on ``.strip()``.
    if sequence_elem is None or sequence_elem.text is None:
        return None, None, "Could not find sequence in UniProt response"
    protein_sequence = sequence_elem.text.strip()

    annotations = {}
    for feature in root.findall(".//up:feature", ns):
        location = feature.find("up:location", ns)
        if location is None:
            continue
        feature_type = feature.get("type")
        description = feature.get("description", "")

        position = location.find("up:position", ns)
        begin = location.find("up:begin", ns)
        end_elem = location.find("up:end", ns)

        if position is not None:
            # Single-position feature.
            pos = _read_position(position)
            if pos is None:
                continue
            record = {"position": pos, "description": description}
        elif begin is not None and end_elem is not None:
            # Range feature (also used for disulfide bond pairs).
            start = _read_position(begin)
            end = _read_position(end_elem)
            if start is None or end is None:
                # A boundary without coordinates cannot be mapped to residues.
                continue
            record = {"begin": start, "end": end, "description": description}
        else:
            continue

        annotations.setdefault(feature_type, []).append(record)

    return protein_sequence, annotations
def _feature_span(item):
    """Return the 1-based inclusive ``(start, end)`` of an annotation item, or None if it has no start."""
    start = item.get("begin", item.get("position"))
    end = item.get("end", item.get("position"))
    if not start:
        return None
    start = int(start)
    return start, (int(end) if end else start)


def _append_description(df, column, start, end, desc):
    """Add *desc* to *column* for residues start..end (1-based, inclusive), joining with '; ' when a cell is already filled."""
    if not desc:
        return
    # Clamp to the frame so out-of-range coordinates never add rows.
    for i in range(max(start - 1, 0), min(end, len(df))):
        current = df.at[i, column]
        if isinstance(current, str) and current != "":
            df.at[i, column] = f"{current}; {desc}"
        else:
            df.at[i, column] = desc


def create_dataframe(protein_sequence, annotations):
    """
    Build a per-residue Pandas DataFrame from a sequence and its annotations.

    Args:
        protein_sequence: The protein sequence; one row is created per
            character, numbered from 1.
        annotations: Dictionary mapping a feature type to a list of dicts.
            Each dict has a ``description`` and either ``position`` or
            ``begin``/``end`` (1-based, inclusive).

    Returns:
        A DataFrame with ``Residue Number``, ``Residue code`` and one text
        column per annotation category. Cells without annotation hold ``""``.
        Coordinates that fall outside the sequence are ignored.
    """
    columns = [
        "Residue Number",
        "Residue code",
        "Secondary structure",
        "Domain",
        "Pfam domain",
        "Disorder",
        "Disulfide bridges",
        "Glycosylation sites",
        "Phosphorylation sites",
        "active sites",
        "Binding sites",  # Combined binding sites column
        "modified",
    ]
    blank_cells = dict.fromkeys(columns[2:], "")
    data = [
        {"Residue Number": number, "Residue code": residue, **blank_cells}
        for number, residue in enumerate(protein_sequence, start=1)
    ]
    # Passing ``columns`` keeps the schema intact even for an empty sequence.
    df = pd.DataFrame(data, columns=columns)
    n_residues = len(df)

    # Feature types that need no special handling, mapped to their column.
    # NOTE(review): "site" features are written to "Phosphorylation sites";
    # confirm this is the intended source for that column.
    feature_mapping = {
        "strand": "Secondary structure",
        "helix": "Secondary structure",
        "turn": "Secondary structure",
        "domain": "Domain",
        "modified residue": "modified",
        "active site": "active sites",
        "site": "Phosphorylation sites",
    }

    for feature_type, values in annotations.items():
        feature_type = feature_type.lower()

        if feature_type == "disulfide bond":
            # Each bonded residue is labelled with its partner's position.
            for item in values:
                if "begin" in item and "end" in item:
                    start = int(item["begin"])
                    end = int(item["end"])
                    if 1 <= start <= n_residues:
                        df.at[start - 1, "Disulfide bridges"] = f"Cys-{end}"
                    if 1 <= end <= n_residues:
                        df.at[end - 1, "Disulfide bridges"] = f"Cys-{start}"
                else:
                    # Single-position entry: there is no in-sequence partner
                    # to point at (presumably an interchain bond; confirm
                    # against UniProt data), so keep the feature's own
                    # description instead.
                    span = _feature_span(item)
                    if span is not None:
                        _append_description(
                            df,
                            "Disulfide bridges",
                            span[0],
                            span[1],
                            item.get("description", ""),
                        )

        elif feature_type == "glycosylation site":
            for item in values:
                pos = item.get("position", item.get("begin"))
                if pos is None:
                    continue
                pos = int(pos)
                if 1 <= pos <= n_residues:
                    df.at[pos - 1, "Glycosylation sites"] = item.get(
                        "description", ""
                    )

        elif feature_type in ("region", "region of interest"):
            # UniProt XML names this feature type "region of interest"; the
            # short form is still accepted for callers that already use it.
            for item in values:
                span = _feature_span(item)
                if span is None:
                    continue
                desc = item.get("description", "").lower()
                # The description decides which column a region belongs to.
                if "pfam" in desc:
                    column = "Pfam domain"
                elif "disorder" in desc:
                    column = "Disorder"
                else:
                    continue
                _append_description(df, column, span[0], span[1], desc)

        elif feature_type == "binding site":
            for item in values:
                span = _feature_span(item)
                if span is None:
                    continue
                _append_description(
                    df,
                    "Binding sites",
                    span[0],
                    span[1],
                    item.get("description", ""),
                )

        else:
            column = feature_mapping.get(feature_type)
            if not column:
                continue
            for item in values:
                span = _feature_span(item)
                if span is None:
                    continue
                start, end = span
                if column == "Secondary structure":
                    # Secondary structure cells hold the element kind itself
                    # and overwrite any earlier value.
                    label = feature_type.upper()
                    for i in range(max(start - 1, 0), min(end, n_residues)):
                        df.at[i, column] = label
                else:
                    _append_description(
                        df, column, start, end, item.get("description", "")
                    )

    return df