Spaces:

tonigi
/

sequencetable

Sleeping

App Files Files Community

tonigi committed on Mar 11, 2025

Commit

2b158eb

1 Parent(s): 1fb077a

fixes

Browse files

Files changed (4) hide show

README.txt +0 -43
app.py +54 -61
test/P01308.xml +0 -0
test/P07550.xml +0 -0

README.txt DELETED Viewed

@@ -1,43 +0,0 @@
-# Protein Sequence Table
-A Gradio-based web application that reformats protein sequences based on UniProt IDs and displays detailed annotations in a structured format.
-## Features
-The application retrieves protein data from UniProt and presents the following information for each residue:
-- Position number in the sequence
-- Amino acid (single-letter code)
-- Secondary structure annotation
-- Associated Pfam domain
-- Disorder prediction
-- Participation in disulfide bridges
-- Post-translational modifications:
-  * Glycosylation sites
-  * Phosphorylation sites
-- Functional annotations:
-  * Active sites
-  * Metal binding sites
-  * DNA binding regions
-  * RNA binding regions
-  * Ligand binding sites
-  * Other modifications
-## Usage
-1. Launch the application
-2. Enter a valid UniProt ID (e.g., P53_HUMAN) in the input field
-3. Click "Submit" to generate the analysis
-4. Results will be displayed in an interactive data frame format
-## Requirements
-- Python 3.7+
-- Gradio
-- Pandas
-- Requests
-- XML parsing libraries
-## Note
-The application processes UniProt's XML format to extract annotations.

app.py CHANGED Viewed

@@ -18,62 +18,58 @@ def get_uniprot_data(uniprot_id):
         - annotations: A dictionary containing annotations.
         - error_message: An error message if something goes wrong, otherwise None
     """
-    try:
-        # Fetch XML data
-        url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
-        response = urlopen(url).read().decode('utf-8')
-        # Parse XML with namespace
-        root = ET.fromstring(response)
-        ns = {'up': 'http://uniprot.org/uniprot'}
-        # Get sequence
-        sequence_elem = root.find(".//up:sequence", ns)
-        if sequence_elem is None:
-            return None, None, "Could not find sequence in UniProt response"
-        protein_sequence = sequence_elem.text.strip()
-        # Get feature annotations
-        annotations = {}
-        for feature in root.findall(".//up:feature", ns):
-            feature_type = feature.get('type')
-            description = feature.get('description', '')
-            # Get position information
-            location = feature.find("up:location", ns)
-            if location is None:
-                continue
-            # Handle different types of position elements
-            position = location.find("up:position", ns)
-            begin = location.find("up:begin", ns)
-            end_elem = location.find("up:end", ns)
-            if position is not None:
-                pos = int(position.get('position'))
-                # For single position features
-                if feature_type not in annotations:
-                    annotations[feature_type] = []
-                annotations[feature_type].append({
-                    'position': pos,
-                    'description': description
-                })
-            elif begin is not None and end_elem is not None:
-                start = int(begin.get('position'))
-                end = int(end_elem.get('position'))
-                # For range features and disulfide bonds
-                if feature_type not in annotations:
-                    annotations[feature_type] = []
-                annotations[feature_type].append({
-                    'begin': start,
-                    'end': end,
-                    'description': description
-                })
-        return protein_sequence, annotations, None
-    except Exception as e:
-        return None, None, f"Error fetching or processing data from UniProt: {e}"
 def create_dataframe(protein_sequence, annotations):
     """
@@ -127,9 +123,9 @@ def create_dataframe(protein_sequence, annotations):
             for item in values:
                 start = item['begin']
                 end = item['end']
-                desc = f"Disulfide bridge with Cys-{end}"
                 df.at[start-1, 'Disulfide bridges'] = desc
-                desc = f"Disulfide bridge with Cys-{start}"
                 df.at[end-1, 'Disulfide bridges'] = desc
         # Handle glycosylation sites
@@ -228,10 +224,7 @@ def process_uniprot_id(uniprot_id):
     Returns:
         A Pandas DataFrame or an error message.
     """
-    protein_sequence, annotations, error_message = get_uniprot_data(uniprot_id)
-    if error_message:
-        return error_message
     if protein_sequence and annotations:
         df = create_dataframe(protein_sequence, annotations)
@@ -255,17 +248,17 @@ with gr.Blocks() as demo:
         gr.Examples(
             examples=[
                 ["P06280"],  # Alpha-galactosidase A
-                ["P04637"],  # Tumor protein p53
                 ["P01308"],  # Insulin
                 ["Q8WZ42"],  # Titin
-                ["P04637"],  # p53 (alternate entry)
                 ["P0DTC2"],  # SARS-CoV-2 Spike protein
             ],
             inputs=input_text,
             label="Example UniProt IDs"
         )
-        output_df = gr.Dataframe()
         submit_btn.click(
             fn=process_uniprot_id,

         - annotations: A dictionary containing annotations.
         - error_message: An error message if something goes wrong, otherwise None
     """
+    # Fetch XML data
+    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
+    response = urlopen(url).read().decode('utf-8')
+    # Parse XML with namespace
+    root = ET.fromstring(response)
+    ns = {'up': 'http://uniprot.org/uniprot'}
+    # Get sequence
+    sequence_elem = root.find("./up:entry/up:sequence", ns)
+    if sequence_elem is None:
+        return None, None, "Could not find sequence in UniProt response"
+    protein_sequence = sequence_elem.text.strip()
+    # Get feature annotations
+    annotations = {}
+    for feature in root.findall(".//up:feature", ns):
+        feature_type = feature.get('type')
+        description = feature.get('description', '')
+        # Get position information
+        location = feature.find("up:location", ns)
+        if location is None:
+            continue
+        # Handle different types of position elements
+        position = location.find("up:position", ns)
+        begin = location.find("up:begin", ns)
+        end_elem = location.find("up:end", ns)
+        if position is not None:
+            pos = int(position.get('position'))
+            # For single position features
+            if feature_type not in annotations:
+                annotations[feature_type] = []
+            annotations[feature_type].append({
+                'position': pos,
+                'description': description
+            })
+        elif begin is not None and end_elem is not None:
+            start = int(begin.get('position'))
+            end = int(end_elem.get('position'))
+            # For range features and disulfide bonds
+            if feature_type not in annotations:
+                annotations[feature_type] = []
+            annotations[feature_type].append({
+                'begin': start,
+                'end': end,
+                'description': description
+            })
+    return protein_sequence, annotations
 def create_dataframe(protein_sequence, annotations):
     """
             for item in values:
                 start = item['begin']
                 end = item['end']
+                desc = f"Cys-{end}"
                 df.at[start-1, 'Disulfide bridges'] = desc
+                desc = f"Cys-{start}"
                 df.at[end-1, 'Disulfide bridges'] = desc
         # Handle glycosylation sites
     Returns:
         A Pandas DataFrame or an error message.
     """
+    protein_sequence, annotations = get_uniprot_data(uniprot_id)
     if protein_sequence and annotations:
         df = create_dataframe(protein_sequence, annotations)
         gr.Examples(
             examples=[
                 ["P06280"],  # Alpha-galactosidase A
+                ["P07550"],  # beta-2 AR
                 ["P01308"],  # Insulin
                 ["Q8WZ42"],  # Titin
                 ["P0DTC2"],  # SARS-CoV-2 Spike protein
             ],
+            example_labels=["Alpha-galactosidase A", "Beta-2 adrenergic receptor", "Insulin", "Titin", "SARS-CoV-2 Spike protein"],
             inputs=input_text,
             label="Example UniProt IDs"
         )
+        output_df = gr.Dataframe(interactive=False)
         submit_btn.click(
             fn=process_uniprot_id,

test/P01308.xml ADDED Viewed

The diff for this file is too large to render. See raw diff

test/P07550.xml ADDED Viewed

The diff for this file is too large to render. See raw diff