Spaces:
Sleeping
Sleeping
| import os | |
| import xml.etree.ElementTree as ET | |
| from urllib.request import urlopen | |
| import pandas as pd | |
def _residue_position(element):
    """Return the integer ``position`` attribute of a location element, or None.

    UniProt locations with an uncertain boundary may omit the ``position``
    attribute, so a missing or non-numeric value yields None instead of
    raising.
    """
    try:
        return int(element.get("position"))
    except (TypeError, ValueError):
        return None


def get_uniprot_data(uniprot_id):
    """
    Fetches protein sequence and annotation data from UniProt in XML format.

    A local copy at ``test/<uniprot_id>.xml`` is used when it exists;
    otherwise the XML is downloaded from UniProt.

    Args:
        uniprot_id: The UniProt ID of the protein.

    Returns:
        On success, a 2-tuple ``(protein_sequence, annotations)``:
            - protein_sequence: The protein sequence as a string with all
              whitespace removed.
            - annotations: A dictionary mapping each feature type to a list
              of dicts. Single-residue features have the keys
              ``position`` and ``description``; range features have
              ``begin``, ``end`` and ``description``.
        If no sequence is found, the 3-tuple ``(None, None, error_message)``.

        NOTE(review): the success and error returns differ in length. This
        is kept as-is so existing callers keep working; confirm how callers
        unpack the result before unifying the shape.

    Raises:
        Network errors from ``urlopen`` and ``xml.etree.ElementTree.ParseError``
        for malformed XML are not caught here.
    """
    # Prefer a local fixture over a network round trip.
    local_file_path = os.path.join("test", f"{uniprot_id}.xml")
    if os.path.exists(local_file_path):
        with open(local_file_path, "r", encoding="utf-8") as file:
            response = file.read()
    else:
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
        # Context manager closes the HTTP response even if decoding fails.
        with urlopen(url) as remote:
            response = remote.read().decode("utf-8")

    # Parse XML with namespace
    root = ET.fromstring(response)
    ns = {"up": "http://uniprot.org/uniprot"}

    # Get sequence; an absent or empty element is reported the same way.
    sequence_elem = root.find("./up:entry/up:sequence", ns)
    sequence_text = sequence_elem.text if sequence_elem is not None else None
    # split/join drops line breaks inside the sequence, not just at the ends.
    protein_sequence = "".join(sequence_text.split()) if sequence_text else ""
    if not protein_sequence:
        return None, None, "Could not find sequence in UniProt response"

    # Get feature annotations
    annotations = {}
    for feature in root.findall(".//up:feature", ns):
        location = feature.find("up:location", ns)
        if location is None:
            continue

        description = feature.get("description", "")
        position = location.find("up:position", ns)
        begin = location.find("up:begin", ns)
        end_elem = location.find("up:end", ns)

        if position is not None:
            # Single position feature
            pos = _residue_position(position)
            if pos is None:
                continue
            entry = {"position": pos, "description": description}
        elif begin is not None and end_elem is not None:
            # Range features and disulfide bonds
            start = _residue_position(begin)
            end = _residue_position(end_elem)
            if start is None or end is None:
                # A boundary without a numeric position cannot be mapped.
                continue
            entry = {"begin": start, "end": end, "description": description}
        else:
            continue

        annotations.setdefault(feature.get("type"), []).append(entry)

    return protein_sequence, annotations
def _feature_span(item):
    """Return the 1-based inclusive ``(start, end)`` of an annotation, or None.

    Range annotations carry ``begin``/``end``; single-residue annotations
    carry ``position``, which then serves as both ends.
    """
    start = item.get("begin", item.get("position"))
    end = item.get("end", item.get("position"))
    if not start:
        return None
    start = int(start)
    end = int(end) if end else start
    return start, end


def _covered_indices(item, length):
    """Return the 0-based row indices an annotation covers, clipped to the sequence."""
    span = _feature_span(item)
    if span is None:
        return range(0)
    start, end = span
    return range(max(start - 1, 0), min(end, length))


def _merge_text(cells, index, text):
    """Write ``text`` into ``cells[index]``, joining with ``"; "`` if already filled."""
    if not text:
        return
    cells[index] = f"{cells[index]}; {text}" if cells[index] else text


def create_dataframe(protein_sequence, annotations):
    """
    Creates a Pandas DataFrame from protein sequence and annotations.

    Args:
        protein_sequence: The residues of the protein, one character each.
        annotations: Mapping of feature type to a list of annotation dicts
            with ``description`` plus either ``position`` or
            ``begin``/``end`` (1-based, inclusive), as produced by
            ``get_uniprot_data``.

    Returns:
        A DataFrame with one row per residue. ``Residue Number`` is 1-based
        and ``Residue code`` holds the residue; every annotation column is
        a string that is empty where nothing applies. Annotations that fall
        outside the sequence are ignored.
    """
    residues = list(protein_sequence)
    length = len(residues)

    # Columns are filled as plain lists and converted once at the end.
    columns = {
        "Residue Number": list(range(1, length + 1)),
        "Residue code": residues,
    }
    for name in (
        "Secondary structure",
        "Domain",
        "Pfam domain",
        "Disorder",
        "Disulfide bridges",
        "Glycosylation sites",
        "Phosphorylation sites",
        "active sites",
        "Binding sites",  # Combined binding sites column
        "modified",
    ):
        columns[name] = [""] * length

    # Map UniProt feature types to our column names
    # NOTE(review): generic "site" features land in the phosphorylation
    # column; confirm that this is the intended mapping.
    feature_mapping = {
        "strand": "Secondary structure",
        "helix": "Secondary structure",
        "turn": "Secondary structure",
        "domain": "Domain",
        "disulfide bond": "Disulfide bridges",
        "glycosylation site": "Glycosylation sites",
        "modified residue": "modified",
        "active site": "active sites",
        "site": "Phosphorylation sites",
    }

    for raw_type, values in annotations.items():
        if not isinstance(raw_type, str):
            # A feature without a type cannot be mapped to any column.
            continue
        feature_type = raw_type.lower()

        # Handle disulfide bond pairs
        if feature_type == "disulfide bond":
            cells = columns["Disulfide bridges"]
            for item in values:
                begin = item.get("begin")
                end = item.get("end")
                if begin is None or end is None:
                    # Single-position bond (partner not in this sequence):
                    # there is no partner residue to name, so keep the
                    # description instead.
                    text = item.get("description") or ""
                    if text:
                        for i in _covered_indices(item, length):
                            cells[i] = text
                    continue
                begin = int(begin)
                end = int(end)
                # Each cysteine is labelled with its partner's position.
                if 1 <= begin <= length:
                    cells[begin - 1] = f"Cys-{end}"
                if 1 <= end <= length:
                    cells[end - 1] = f"Cys-{begin}"

        # Handle glycosylation sites
        elif feature_type == "glycosylation site":
            cells = columns["Glycosylation sites"]
            for item in values:
                text = item.get("description") or ""
                for i in _covered_indices(item, length):
                    cells[i] = text

        # Handle region features
        elif feature_type == "region":
            for item in values:
                desc = (item.get("description") or "").lower()
                # Map to appropriate column based on description
                if "pfam" in desc:
                    cells = columns["Pfam domain"]
                elif "disorder" in desc:
                    cells = columns["Disorder"]
                else:
                    continue
                for i in _covered_indices(item, length):
                    _merge_text(cells, i, desc)

        # Handle binding site features
        elif feature_type == "binding site":
            cells = columns["Binding sites"]
            for item in values:
                desc = item.get("description") or ""
                for i in _covered_indices(item, length):
                    _merge_text(cells, i, desc)

        # Handle other features
        else:
            column = feature_mapping.get(feature_type)
            if not column:
                continue
            cells = columns[column]
            for item in values:
                if column == "Secondary structure":
                    # Secondary structure shows the element kind, not text.
                    label = feature_type.upper()
                    for i in _covered_indices(item, length):
                        cells[i] = label
                else:
                    desc = item.get("description") or ""
                    for i in _covered_indices(item, length):
                        _merge_text(cells, i, desc)

    return pd.DataFrame(columns)