import os
import xml.etree.ElementTree as ET
from urllib.request import urlopen

import pandas as pd

# XML namespace used to address elements of the UniProt entry document.
_UNIPROT_NS = {"up": "http://uniprot.org/uniprot"}

# Per-residue annotation columns, in output order. They follow the two
# identity columns "Residue Number" and "Residue code".
_ANNOTATION_COLUMNS = (
    "Secondary structure",
    "Domain",
    "Pfam domain",
    "Disorder",
    "Disulfide bridges",
    "Glycosylation sites",
    "Phosphorylation sites",
    "active sites",
    "Binding sites",  # Combined binding sites column
    "modified",
)

# Lower-cased UniProt feature type -> output column.
# NOTE(review): "site" is routed to "Phosphorylation sites"; UniProt appears
# to report phosphorylation under "modified residue" instead -- confirm that
# this mapping is intended before relying on that column.
_FEATURE_COLUMNS = {
    "strand": "Secondary structure",
    "helix": "Secondary structure",
    "turn": "Secondary structure",
    "domain": "Domain",
    "disulfide bond": "Disulfide bridges",
    "glycosylation site": "Glycosylation sites",
    "modified residue": "modified",
    "active site": "active sites",
    "site": "Phosphorylation sites",
}


def _read_position(element):
    """Return the integer ``position`` attribute of *element*, or None.

    None is returned when the element is absent, carries no ``position``
    attribute, or the attribute is not an integer.
    """
    if element is None:
        return None
    raw = element.get("position")
    if raw is None:
        return None
    try:
        return int(raw)
    except ValueError:
        return None


def get_uniprot_data(uniprot_id):
    """Fetch a protein's sequence and feature annotations from UniProt XML.

    The XML is read from ``test/<uniprot_id>.xml`` (relative to the current
    working directory) when that file exists; otherwise it is downloaded
    from ``https://www.uniprot.org/uniprot/<uniprot_id>.xml``.

    Args:
        uniprot_id: The UniProt ID of the protein.

    Returns:
        On success, a 2-tuple ``(protein_sequence, annotations)``:

        - protein_sequence: the sequence as a string with all whitespace
          removed.
        - annotations: a dict mapping each feature ``type`` to a list of
          dicts. Single-position features are stored as
          ``{"position", "description"}``; ranged features as
          ``{"begin", "end", "description"}``.

        When the entry contains no ``<sequence>`` element, the 3-tuple
        ``(None, None, error_message)`` is returned instead.

        NOTE(review): the success and failure shapes differ in length. They
        are kept exactly as they were so existing callers keep working;
        callers should check the length (or the first item for None) before
        unpacking.

    Raises:
        urllib.error.URLError: If the download fails.
        xml.etree.ElementTree.ParseError: If the document is not valid XML.
    """
    local_file_path = os.path.join("test", f"{uniprot_id}.xml")
    if os.path.exists(local_file_path):
        with open(local_file_path, "r", encoding="utf-8") as file:
            xml_text = file.read()
    else:
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
        # Use a context manager so the HTTP response is always closed.
        with urlopen(url) as response:
            xml_text = response.read().decode("utf-8")

    root = ET.fromstring(xml_text)

    sequence_elem = root.find("./up:entry/up:sequence", _UNIPROT_NS)
    if sequence_elem is None:
        return None, None, "Could not find sequence in UniProt response"
    # Drop every whitespace character (not only leading/trailing) so that a
    # line-wrapped sequence does not yield whitespace "residues"; an element
    # without text gives an empty sequence instead of raising.
    protein_sequence = "".join((sequence_elem.text or "").split())

    annotations = {}
    for feature in root.findall(".//up:feature", _UNIPROT_NS):
        location = feature.find("up:location", _UNIPROT_NS)
        if location is None:
            continue

        feature_type = feature.get("type")
        description = feature.get("description", "")

        position = location.find("up:position", _UNIPROT_NS)
        begin = location.find("up:begin", _UNIPROT_NS)
        end_elem = location.find("up:end", _UNIPROT_NS)

        if position is not None:
            # Single-position feature.
            pos = _read_position(position)
            if pos is None:
                # No usable coordinate on the element; skip the feature
                # rather than fail on int(None).
                continue
            entry = {"position": pos, "description": description}
        elif begin is not None and end_elem is not None:
            # Ranged feature (also used for paired disulfide bonds).
            start = _read_position(begin)
            end = _read_position(end_elem)
            if start is None or end is None:
                continue
            entry = {"begin": start, "end": end, "description": description}
        else:
            continue

        annotations.setdefault(feature_type, []).append(entry)

    return protein_sequence, annotations


def _residue_span(item, length):
    """Return the zero-based residue indices covered by an annotation item.

    *item* is either ``{"begin", "end", ...}`` or ``{"position", ...}`` with
    one-based coordinates. The result is clipped to ``[0, length)``. An empty
    range is returned when the item has no (or a zero) start coordinate.
    """
    start = item.get("begin", item.get("position"))
    end = item.get("end", item.get("position"))
    if not start:
        return range(0)
    start = int(start)
    end = int(end) if end else start
    return range(max(start - 1, 0), min(end, length))


def _append_note(cells, index, note):
    """Store *note* in ``cells[index]``, joining with ``"; "`` if occupied.

    Empty notes are ignored so they never overwrite or pad existing text.
    """
    if not note:
        return
    current = cells[index]
    cells[index] = f"{current}; {note}" if current else note


def _mark_disulfide(cells, item):
    """Record one disulfide bond in the "Disulfide bridges" cell list.

    A ranged item marks each in-range end with ``Cys-<partner position>``.
    A single-position item has no partner in this sequence, so its
    description is stored instead (falling back to ``"Interchain"`` --
    presumably such entries are interchain bonds; verify against the data).
    Coordinates outside the sequence are ignored.
    """
    length = len(cells)
    begin, end = item.get("begin"), item.get("end")
    if begin is None or end is None:
        pos = item.get("position")
        if pos is not None and 1 <= int(pos) <= length:
            cells[int(pos) - 1] = item.get("description") or "Interchain"
        return
    begin, end = int(begin), int(end)
    if 1 <= begin <= length:
        cells[begin - 1] = f"Cys-{end}"
    if 1 <= end <= length:
        cells[end - 1] = f"Cys-{begin}"


def create_dataframe(protein_sequence, annotations):
    """Create a per-residue pandas DataFrame from a sequence and annotations.

    Args:
        protein_sequence: The protein sequence; one output row per character.
        annotations: Dict mapping a feature type to a list of items shaped
            like those produced by ``get_uniprot_data`` (one-based
            ``position`` or ``begin``/``end`` plus ``description``).

    Returns:
        A DataFrame with columns "Residue Number" (one-based), "Residue code"
        and one text column per annotation category. Cells are empty strings
        where no annotation applies. The columns are present even for an
        empty sequence.

        Filling rules, by lower-cased feature type:

        - "disulfide bond": each end is set to ``Cys-<partner>``.
        - "glycosylation site": the cell is set to the description.
        - "region": the lower-cased description is appended to
          "Pfam domain" or "Disorder" when it contains "pfam" or "disorder";
          other regions are ignored.
        - "binding site": the description is appended to "Binding sites".
        - "strand"/"helix"/"turn": the cell is set to the upper-cased type.
        - other mapped types: the description is appended to their column.

        Appended descriptions are separated by ``"; "``. Coordinates outside
        the sequence are ignored.
    """
    length = len(protein_sequence)

    # Build plain Python lists and create the DataFrame once at the end;
    # this avoids per-cell DataFrame writes and can never add stray rows.
    columns = {
        "Residue Number": list(range(1, length + 1)),
        "Residue code": list(protein_sequence),
    }
    for name in _ANNOTATION_COLUMNS:
        columns[name] = [""] * length

    for raw_type, values in annotations.items():
        feature_type = (raw_type or "").lower()

        if feature_type == "disulfide bond":
            for item in values:
                _mark_disulfide(columns["Disulfide bridges"], item)

        elif feature_type == "glycosylation site":
            cells = columns["Glycosylation sites"]
            for item in values:
                for index in _residue_span(item, length):
                    cells[index] = item["description"]

        elif feature_type == "region":
            for item in values:
                span = _residue_span(item, length)
                desc = (item.get("description") or "").lower()
                # Only regions whose description names a known category
                # have a column to go to.
                if "pfam" in desc:
                    column = "Pfam domain"
                elif "disorder" in desc:
                    column = "Disorder"
                else:
                    continue
                for index in span:
                    _append_note(columns[column], index, desc)

        elif feature_type == "binding site":
            cells = columns["Binding sites"]
            for item in values:
                desc = item["description"]
                for index in _residue_span(item, length):
                    _append_note(cells, index, desc)

        else:
            column = _FEATURE_COLUMNS.get(feature_type)
            if not column:
                continue
            cells = columns[column]
            for item in values:
                span = _residue_span(item, length)
                if column == "Secondary structure":
                    # Secondary structure stores the element kind itself and
                    # overwrites, rather than accumulating descriptions.
                    label = feature_type.upper()
                    for index in span:
                        cells[index] = label
                else:
                    desc = item["description"]
                    for index in span:
                        _append_note(cells, index, desc)

    return pd.DataFrame(columns)