File size: 7,729 Bytes
cb99418
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
import os
import xml.etree.ElementTree as ET
from urllib.request import urlopen

import pandas as pd


def _parse_position(element):
    """Return the integer ``position`` attribute of *element*, or None.

    None is returned when the element is missing, has no ``position``
    attribute, or the attribute is not a valid integer.
    """
    if element is None:
        return None
    try:
        return int(element.get("position"))
    except (TypeError, ValueError):
        return None


def get_uniprot_data(uniprot_id):
    """
    Fetches protein sequence and annotation data from UniProt in XML format.

    The XML is read from ``test/<uniprot_id>.xml`` when that file exists;
    otherwise it is downloaded from UniProt.

    Args:
        uniprot_id: The UniProt ID of the protein.

    Returns:
        On success, a 2-tuple ``(protein_sequence, annotations)``:
        - protein_sequence: The protein sequence as a string, with all
          whitespace removed.
        - annotations: A dictionary mapping each feature ``type`` to a list
          of dicts, each either ``{"position", "description"}`` (single
          residue) or ``{"begin", "end", "description"}`` (range).

        When the entry has no usable sequence, a 3-tuple
        ``(None, None, error_message)`` is returned instead.

        NOTE(review): the two return shapes differ in length. This is kept
        as-is so existing callers keep working; confirm which shape callers
        expect before unifying them.
    """
    # Prefer a local copy of the entry, fall back to the network.
    local_file_path = os.path.join("test", f"{uniprot_id}.xml")
    if os.path.exists(local_file_path):
        with open(local_file_path, "r", encoding="utf-8") as file:
            response = file.read()
    else:
        # Fetch XML data from UniProt
        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
        # The context manager closes the HTTP connection even on error.
        with urlopen(url) as http_response:
            response = http_response.read().decode("utf-8")

    # Parse XML with namespace
    root = ET.fromstring(response)
    ns = {"up": "http://uniprot.org/uniprot"}

    # Get sequence; drop any embedded line breaks/spaces so that every
    # character of the returned string is a residue.
    sequence_elem = root.find("./up:entry/up:sequence", ns)
    protein_sequence = ""
    if sequence_elem is not None and sequence_elem.text:
        protein_sequence = "".join(sequence_elem.text.split())
    if not protein_sequence:
        return None, None, "Could not find sequence in UniProt response"

    # Get feature annotations
    annotations = {}
    for feature in root.findall(".//up:feature", ns):
        location = feature.find("up:location", ns)
        if location is None:
            continue

        description = feature.get("description", "")

        position = location.find("up:position", ns)
        if position is not None:
            # Single-position feature
            pos = _parse_position(position)
            if pos is None:
                continue
            entry = {"position": pos, "description": description}
        else:
            # Range feature (also used for disulfide bond pairs)
            start = _parse_position(location.find("up:begin", ns))
            end = _parse_position(location.find("up:end", ns))
            if start is None or end is None:
                # Endpoint missing or without a numeric position: skip
                # rather than fail the whole entry.
                continue
            entry = {"begin": start, "end": end, "description": description}

        annotations.setdefault(feature.get("type"), []).append(entry)

    return protein_sequence, annotations


def _feature_span(item):
    """Return the 1-based inclusive ``(start, end)`` of an annotation item.

    Range items carry ``begin``/``end``; single-residue items carry
    ``position``. Returns None when the item has no usable start.
    """
    start = item.get("begin", item.get("position"))
    if not start:
        return None
    end = item.get("end", item.get("position"))
    start = int(start)
    end = int(end) if end else start
    return start, end


def _append_cell_text(df, row, column, text):
    """Write *text* into a cell, joining with ``"; "`` if it already has text."""
    if not text:
        return
    current = df.at[row, column]
    if isinstance(current, str) and current != "":
        df.at[row, column] = f"{current}; {text}"
    else:
        df.at[row, column] = text


def create_dataframe(protein_sequence, annotations):
    """
    Creates a Pandas DataFrame from protein sequence and annotations.

    Args:
        protein_sequence: The sequence string; one output row per character.
        annotations: Dict mapping a feature type to a list of items, each a
            dict with ``description`` and either ``position`` or
            ``begin``/``end`` (1-based, inclusive).

    Returns:
        A DataFrame with one row per residue and one text column per
        annotation category. Cells without annotation hold ``""``.
        Positions outside the sequence are ignored.
    """
    columns = [
        "Residue Number",
        "Residue code",
        "Secondary structure",
        "Domain",
        "Pfam domain",
        "Disorder",
        "Disulfide bridges",
        "Glycosylation sites",
        "Phosphorylation sites",
        "active sites",
        "Binding sites",  # Combined binding sites column
        "modified",
    ]
    data = [
        {
            "Residue Number": residue_number,
            "Residue code": residue,
            **dict.fromkeys(columns[2:], ""),
        }
        for residue_number, residue in enumerate(protein_sequence, start=1)
    ]
    # Passing columns explicitly keeps the schema even for an empty sequence.
    df = pd.DataFrame(data, columns=columns)
    n_residues = len(df)

    # Map UniProt feature types to our column names
    # NOTE(review): "site" is routed to "Phosphorylation sites"; confirm this
    # is the intended source for that column.
    feature_mapping = {
        "strand": "Secondary structure",
        "helix": "Secondary structure",
        "turn": "Secondary structure",
        "domain": "Domain",
        "disulfide bond": "Disulfide bridges",
        "glycosylation site": "Glycosylation sites",
        "modified residue": "modified",
        "active site": "active sites",
        "site": "Phosphorylation sites",
    }

    def in_sequence(position):
        # df.at would silently append a new row for an unknown index label,
        # so every direct write is guarded by this check.
        return 1 <= position <= n_residues

    def covered_rows(span):
        # Zero-based row indices of the span, clipped to the sequence.
        start, end = span
        return range(max(start - 1, 0), min(end, n_residues))

    for feature_type, values in annotations.items():
        if not isinstance(feature_type, str):
            continue
        feature_type = feature_type.lower()

        # Handle disulfide bond pairs
        if feature_type == "disulfide bond":
            for item in values:
                start = item.get("begin")
                end = item.get("end")
                if start is None or end is None:
                    # Single-position entry (no partner in this sequence):
                    # record its description instead of a partner residue.
                    pos = item.get("position")
                    desc = item.get("description")
                    if pos and desc and in_sequence(int(pos)):
                        df.at[int(pos) - 1, "Disulfide bridges"] = desc
                    continue

                start, end = int(start), int(end)
                # Each cysteine is labelled with its partner's position.
                if in_sequence(start):
                    df.at[start - 1, "Disulfide bridges"] = f"Cys-{end}"
                if in_sequence(end):
                    df.at[end - 1, "Disulfide bridges"] = f"Cys-{start}"

        # Handle glycosylation sites
        elif feature_type == "glycosylation site":
            for item in values:
                pos = item.get("position", item.get("begin"))
                if not pos or not in_sequence(int(pos)):
                    continue
                df.at[int(pos) - 1, "Glycosylation sites"] = item.get(
                    "description", ""
                )

        # Handle region features
        elif feature_type == "region":
            for item in values:
                span = _feature_span(item)
                if span is None:
                    continue
                desc = (item.get("description") or "").lower()

                # Map to appropriate column based on description
                if "pfam" in desc:
                    column = "Pfam domain"
                elif "disorder" in desc:
                    column = "Disorder"
                else:
                    continue

                for i in covered_rows(span):
                    _append_cell_text(df, i, column, desc)

        # Handle binding site features
        elif feature_type == "binding site":
            for item in values:
                span = _feature_span(item)
                if span is None:
                    continue
                desc = item.get("description") or ""
                for i in covered_rows(span):
                    _append_cell_text(df, i, "Binding sites", desc)

        # Handle other features
        else:
            column = feature_mapping.get(feature_type)
            if not column:
                continue

            for item in values:
                span = _feature_span(item)
                if span is None:
                    continue

                if column == "Secondary structure":
                    # Secondary structure cells hold the element kind itself.
                    label = feature_type.upper()
                    for i in covered_rows(span):
                        df.at[i, column] = label
                else:
                    desc = item.get("description") or ""
                    for i in covered_rows(span):
                        _append_cell_text(df, i, column, desc)

    return df