# sequencetable / uniprot_data.py
# tonigi's picture
# refact
# cb99418
import os
import xml.etree.ElementTree as ET
from urllib.request import urlopen
import pandas as pd
def get_uniprot_data(uniprot_id):
    """
    Fetch a protein's sequence and feature annotations from UniProt XML.

    The XML is read from ``test/<uniprot_id>.xml`` (relative to the current
    working directory) when that file exists; otherwise it is downloaded from
    ``https://www.uniprot.org/uniprot/<uniprot_id>.xml``.

    Args:
        uniprot_id: The UniProt ID of the protein.

    Returns:
        On success, a 2-tuple ``(protein_sequence, annotations)``:
          - protein_sequence: the sequence text with all whitespace removed.
          - annotations: dict mapping each feature ``type`` attribute to a
            list of dicts, each either
            ``{"position": int, "description": str}`` or
            ``{"begin": int, "end": int, "description": str}``.
        When the document has no sequence, a 3-tuple
        ``(None, None, error_message)``.

        NOTE(review): the two outcomes have different tuple lengths. This is
        kept exactly as before because the callers are not visible from this
        function; confirm how they unpack the result before unifying it.

    Raises:
        Errors from reading the file, performing the HTTP request, or parsing
        the XML are not caught here and propagate to the caller.
    """
    # Prefer a local copy (used as a fixture/cache) over a network round trip.
    local_file_path = os.path.join("test", f"{uniprot_id}.xml")
    if os.path.exists(local_file_path):
        with open(local_file_path, "r", encoding="utf-8") as file:
            xml_text = file.read()
    else:
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
        # Context manager so the HTTP response is always closed.
        with urlopen(url) as response:
            xml_text = response.read().decode("utf-8")

    root = ET.fromstring(xml_text)

    # Read the namespace from the document root instead of hard-coding it, so
    # a document declaring a variant of the URI (e.g. an https scheme) is
    # still understood. Fall back to the historical URI for a bare root tag.
    if root.tag.startswith("{"):
        ns_uri = root.tag[1:].partition("}")[0]
    else:
        ns_uri = "http://uniprot.org/uniprot"
    ns = {"up": ns_uri}

    sequence_elem = root.find("./up:entry/up:sequence", ns)
    if sequence_elem is None or sequence_elem.text is None:
        return None, None, "Could not find sequence in UniProt response"
    # Drop interior whitespace as well as leading/trailing: every remaining
    # character is treated as one residue downstream, so a stray line break
    # would shift residue numbering.
    protein_sequence = "".join(sequence_elem.text.split())

    annotations = {}
    for feature in root.findall(".//up:feature", ns):
        location = feature.find("up:location", ns)
        if location is None:
            continue

        # A location is either a single <position> or a <begin>/<end> pair;
        # a <position> takes precedence when both are present.
        position = location.find("up:position", ns)
        begin = location.find("up:begin", ns)
        end_elem = location.find("up:end", ns)
        if position is not None:
            raw_coords = {"position": position.get("position")}
        elif begin is not None and end_elem is not None:
            raw_coords = {
                "begin": begin.get("position"),
                "end": end_elem.get("position"),
            }
        else:
            continue

        # A coordinate element without a ``position`` attribute (e.g. one
        # carrying only a status) cannot be placed on the sequence; skip the
        # feature instead of failing on int(None).
        if any(value is None for value in raw_coords.values()):
            continue

        record = {key: int(value) for key, value in raw_coords.items()}
        record["description"] = feature.get("description", "")
        annotations.setdefault(feature.get("type"), []).append(record)

    return protein_sequence, annotations
# Annotation columns of the residue table, in output order; all start empty.
_ANNOTATION_COLUMNS = (
    "Secondary structure",
    "Domain",
    "Pfam domain",
    "Disorder",
    "Disulfide bridges",
    "Glycosylation sites",
    "Phosphorylation sites",
    "active sites",
    "Binding sites",  # Combined binding sites column
    "modified",
)

# Lower-cased feature type -> column, for features handled generically.
# NOTE(review): "site" is routed to "Phosphorylation sites" as in the original
# mapping; confirm that this is the intended source for that column.
_FEATURE_COLUMNS = {
    "strand": "Secondary structure",
    "helix": "Secondary structure",
    "turn": "Secondary structure",
    "domain": "Domain",
    "modified residue": "modified",
    "active site": "active sites",
    "site": "Phosphorylation sites",
    "binding site": "Binding sites",
}


def _feature_rows(item, n_rows):
    """Return the zero-based row indices covered by *item*, clipped to the table."""
    start = item.get("begin", item.get("position"))
    end = item.get("end", item.get("position"))
    if not start:
        return range(0)
    start = int(start)
    end = int(end) if end else start
    # Residue numbers are 1-based and inclusive; rows are 0-based.
    return range(max(start - 1, 0), min(end, n_rows))


def _append_description(df, row, column, desc):
    """Write *desc* into a cell, joining with '; ' when the cell is already filled."""
    if not desc:
        return
    current = df.at[row, column]
    if isinstance(current, str) and current != "":
        df.at[row, column] = f"{current}; {desc}"
    else:
        df.at[row, column] = desc


def create_dataframe(protein_sequence, annotations):
    """
    Create a per-residue Pandas DataFrame from a sequence and its annotations.

    The frame has one row per character of ``protein_sequence`` with the
    columns "Residue Number" (1-based), "Residue code", and the annotation
    columns, which start as empty strings.

    Args:
        protein_sequence: The sequence as a string; ``None`` or ``""`` gives
            an empty frame that still carries every column.
        annotations: Dict mapping a feature type to a list of dicts holding
            either "position" or "begin"/"end" (1-based, inclusive) plus a
            "description". ``None`` is treated as no annotations.

    Returns:
        The populated DataFrame. Feature types are matched case-insensitively:
          - "disulfide bond": each end of a begin/end pair is labelled
            ``Cys-<partner number>``; a position-only record is labelled with
            its description instead, since it names no partner.
          - "glycosylation site": the description overwrites the cell.
          - "region": the lower-cased description is appended to
            "Pfam domain" or "Disorder" when it contains "pfam" or
            "disorder"; other regions are ignored.
          - strand/helix/turn: the upper-cased type overwrites
            "Secondary structure".
          - other mapped types: descriptions are appended, "; "-separated.
        Coordinates that fall outside the sequence are ignored, so the frame
        always has exactly one row per residue.
    """
    protein_sequence = protein_sequence or ""
    annotations = annotations or {}
    n_rows = len(protein_sequence)

    # Build column-wise so the columns exist even for an empty sequence.
    columns = {
        "Residue Number": list(range(1, n_rows + 1)),
        "Residue code": list(protein_sequence),
    }
    for name in _ANNOTATION_COLUMNS:
        columns[name] = [""] * n_rows
    df = pd.DataFrame(columns)

    for raw_type, items in annotations.items():
        feature_type = (raw_type or "").lower()

        if feature_type == "disulfide bond":
            for item in items:
                start = item.get("begin")
                end = item.get("end")
                if start is None or end is None:
                    # Position-only record: there is no partner residue to
                    # point at, so record the description on that residue.
                    desc = item.get("description") or ""
                    if desc:
                        for row in _feature_rows(item, n_rows):
                            df.at[row, "Disulfide bridges"] = desc
                    continue
                start, end = int(start), int(end)
                # Label each cysteine with its partner; skip an end that lies
                # outside the sequence rather than growing the frame.
                for here, partner in ((start, end), (end, start)):
                    if 1 <= here <= n_rows:
                        df.at[here - 1, "Disulfide bridges"] = f"Cys-{partner}"

        elif feature_type == "glycosylation site":
            for item in items:
                desc = item.get("description", "")
                for row in _feature_rows(item, n_rows):
                    df.at[row, "Glycosylation sites"] = desc

        elif feature_type == "region":
            for item in items:
                desc = (item.get("description") or "").lower()
                # The target column is chosen from the description text.
                if "pfam" in desc:
                    column = "Pfam domain"
                elif "disorder" in desc:
                    column = "Disorder"
                else:
                    continue
                for row in _feature_rows(item, n_rows):
                    _append_description(df, row, column, desc)

        else:
            column = _FEATURE_COLUMNS.get(feature_type)
            if not column:
                continue
            for item in items:
                rows = _feature_rows(item, n_rows)
                if column == "Secondary structure":
                    # Later features overwrite earlier ones on shared residues.
                    for row in rows:
                        df.at[row, column] = feature_type.upper()
                else:
                    desc = item.get("description") or ""
                    for row in rows:
                        _append_description(df, row, column, desc)

    return df