Spaces:

tonigi
/

sequencetable

Sleeping

File size: 7,729 Bytes

cb99418

import os
import xml.etree.ElementTree as ET
from urllib.request import urlopen

import pandas as pd


def _read_position(element):
    """Returns the integer ``position`` attribute of a location child element.

    Returns None when the element is missing or carries no ``position``
    attribute, so the caller can skip features that cannot be placed on the
    sequence instead of crashing on ``int(None)``.
    """
    if element is None:
        return None
    raw = element.get("position")
    if raw is None:
        return None
    return int(raw)


def get_uniprot_data(uniprot_id):
    """
    Fetches protein sequence and feature annotations for a UniProt entry.

    The entry XML is read from ``test/<uniprot_id>.xml`` (relative to the
    current working directory) when that file exists; otherwise it is
    downloaded from uniprot.org. Network and XML parsing errors are not
    caught here and propagate to the caller.

    Args:
        uniprot_id: The UniProt ID of the protein.

    Returns:
        On success, a 2-tuple ``(protein_sequence, annotations)``:
        - protein_sequence: The stripped text of the entry's sequence element.
        - annotations: A dictionary mapping each feature's ``type`` attribute
          to a list of dicts, each either
          ``{"position": int, "description": str}`` or
          ``{"begin": int, "end": int, "description": str}``.

        When the entry has no usable sequence, a 3-tuple
        ``(None, None, error_message)``.

        NOTE(review): the two return shapes differ in length. Both are kept
        as they were because the callers are not visible from this function;
        confirm how callers unpack the result before unifying them.
    """
    # Prefer a local copy of the entry (used for offline runs) over the network
    local_file_path = os.path.join("test", f"{uniprot_id}.xml")
    if os.path.exists(local_file_path):
        with open(local_file_path, "r", encoding="utf-8") as file:
            response = file.read()
    else:
        # Fetch XML data from UniProt; the context manager closes the connection
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
        with urlopen(url) as remote:
            response = remote.read().decode("utf-8")

    # Parse XML with namespace
    root = ET.fromstring(response)
    ns = {"up": "http://uniprot.org/uniprot"}

    # Get sequence; an empty <sequence/> element has text None
    sequence_elem = root.find("./up:entry/up:sequence", ns)
    if sequence_elem is None or sequence_elem.text is None:
        return None, None, "Could not find sequence in UniProt response"
    protein_sequence = sequence_elem.text.strip()

    # Get feature annotations
    annotations = {}
    for feature in root.findall(".//up:feature", ns):
        feature_type = feature.get("type")
        description = feature.get("description", "")

        location = feature.find("up:location", ns)
        if location is None:
            continue

        single = location.find("up:position", ns)
        if single is not None:
            # Single-position feature
            pos = _read_position(single)
            if pos is None:
                continue
            record = {"position": pos, "description": description}
        else:
            # Range feature (also used for disulfide bond pairs). Endpoints
            # without a position attribute cannot be placed, so the feature
            # is skipped rather than aborting the whole entry.
            start = _read_position(location.find("up:begin", ns))
            end = _read_position(location.find("up:end", ns))
            if start is None or end is None:
                continue
            record = {"begin": start, "end": end, "description": description}

        annotations.setdefault(feature_type, []).append(record)

    return protein_sequence, annotations


def _covered_rows(item, n_rows):
    """Returns the 0-based row range an annotation covers, clipped to the table.

    Range items use ``begin``/``end``; single-site items use ``position`` for
    both ends. Items without a usable start yield an empty range.
    """
    start = item.get("begin", item.get("position"))
    end = item.get("end", item.get("position"))
    if not start:
        return range(0)
    start = int(start)
    end = int(end) if end else start
    # Clip on both sides: writing to a missing label via df.at would append a row
    return range(max(start - 1, 0), min(end, n_rows))


def _append_text(df, row, column, text):
    """Writes text into a cell, joining with "; " if the cell already has text."""
    if not text:
        return
    current = df.at[row, column]
    if isinstance(current, str) and current != "":
        df.at[row, column] = f"{current}; {text}"
    else:
        df.at[row, column] = text


def create_dataframe(protein_sequence, annotations):
    """
    Creates a Pandas DataFrame from protein sequence and annotations.

    Args:
        protein_sequence: The protein sequence as a string; each character
            becomes one row, numbered from 1 in "Residue Number".
        annotations: A dictionary mapping feature type names (matched
            case-insensitively) to lists of dicts holding either
            ``position`` or ``begin``/``end`` (1-based, inclusive) plus a
            ``description``.

    Returns:
        A DataFrame with one row per residue and one text column per
        annotation category. Cells without an annotation hold "". Positions
        outside the sequence are ignored. Feature types that are not
        recognised are skipped.
    """
    annotation_columns = (
        "Secondary structure",
        "Domain",
        "Pfam domain",
        "Disorder",
        "Disulfide bridges",
        "Glycosylation sites",
        "Phosphorylation sites",
        "active sites",
        "Binding sites",  # Combined binding sites column
        "modified",
    )
    rows = [
        {
            "Residue Number": number,
            "Residue code": residue,
            **dict.fromkeys(annotation_columns, ""),
        }
        for number, residue in enumerate(protein_sequence, start=1)
    ]
    # Passing columns explicitly keeps the layout even for an empty sequence
    df = pd.DataFrame(
        rows, columns=["Residue Number", "Residue code", *annotation_columns]
    )
    n_residues = len(df)

    # Map UniProt feature types to our column names
    # NOTE(review): generic "site" features land in the phosphorylation
    # column; kept as in the original mapping - confirm this is intended.
    feature_mapping = {
        "strand": "Secondary structure",
        "helix": "Secondary structure",
        "turn": "Secondary structure",
        "domain": "Domain",
        "disulfide bond": "Disulfide bridges",
        "glycosylation site": "Glycosylation sites",
        "modified residue": "modified",
        "active site": "active sites",
        "site": "Phosphorylation sites",
    }

    for feature_type, values in annotations.items():
        feature_type = feature_type.lower()

        # Handle disulfide bond pairs: each cysteine is labelled with its partner
        if feature_type == "disulfide bond":
            for item in values:
                if "begin" in item and "end" in item:
                    first = int(item["begin"])
                    second = int(item["end"])
                    if 1 <= first <= n_residues:
                        df.at[first - 1, "Disulfide bridges"] = f"Cys-{second}"
                    if 1 <= second <= n_residues:
                        df.at[second - 1, "Disulfide bridges"] = f"Cys-{first}"
                elif "position" in item:
                    # Position-only bond (partner is not on this sequence):
                    # there is no partner index to show, so use the description.
                    site = int(item["position"])
                    if 1 <= site <= n_residues:
                        label = item.get("description", "") or "Interchain"
                        df.at[site - 1, "Disulfide bridges"] = label

        # Handle glycosylation sites (the latest description overwrites)
        elif feature_type == "glycosylation site":
            for item in values:
                site = item.get("position", item.get("begin"))
                if site is None:
                    continue
                site = int(site)
                if 1 <= site <= n_residues:
                    df.at[site - 1, "Glycosylation sites"] = item["description"]

        # Handle region features: the column is chosen from the description
        elif feature_type == "region":
            for item in values:
                desc = item.get("description", "").lower()
                if "pfam" in desc:
                    column = "Pfam domain"
                elif "disorder" in desc:
                    column = "Disorder"
                else:
                    continue
                for row in _covered_rows(item, n_residues):
                    _append_text(df, row, column, desc)

        # Handle binding site features
        elif feature_type == "binding site":
            for item in values:
                desc = item.get("description", "")
                for row in _covered_rows(item, n_residues):
                    _append_text(df, row, "Binding sites", desc)

        # Handle other features
        else:
            column = feature_mapping.get(feature_type)
            if not column:
                continue

            for item in values:
                covered = _covered_rows(item, n_residues)
                if column == "Secondary structure":
                    # Secondary structure stores the element type, not the description
                    for row in covered:
                        df.at[row, column] = feature_type.upper()
                else:
                    desc = item.get("description", "")
                    for row in covered:
                        _append_text(df, row, column, desc)

    return df