Spaces:
Running
Running
new pre-processing function
Browse files
app.py
CHANGED
|
@@ -201,20 +201,27 @@ def fig_to_img(fig):
|
|
| 201 |
img = Image.open(buf)
|
| 202 |
return img
|
| 203 |
|
| 204 |
-
|
| 205 |
-
def process_family_sequence(protein_fasta):
|
| 206 |
lines = protein_fasta.split('\n')
|
| 207 |
-
|
| 208 |
headers = [line for line in lines if line.startswith('>')]
|
| 209 |
if len(headers) > 1:
|
| 210 |
-
return None,
|
| 211 |
|
| 212 |
protein_sequence = ''.join(line for line in lines if not line.startswith('>'))
|
| 213 |
-
|
| 214 |
# Check for invalid characters
|
| 215 |
valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy") # the 20 standard amino acids
|
| 216 |
if not set(protein_sequence).issubset(valid_characters):
|
| 217 |
-
return None,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|
| 220 |
input_idsfam = encoded_input["input_ids"]
|
|
@@ -263,20 +270,7 @@ def process_family_sequence(protein_fasta):
|
|
| 263 |
|
| 264 |
|
| 265 |
def process_single_sequence(protein_fasta): #, protein_file
|
| 266 |
-
|
| 267 |
-
lines = protein_fasta.split('\n')
|
| 268 |
-
|
| 269 |
-
headers = [line for line in lines if line.startswith('>')]
|
| 270 |
-
if len(headers) > 1:
|
| 271 |
-
return None, "Multiple fasta sequences detected. Please upload a fasta file with multiple sequences, otherwise only include one fasta sequence.", None
|
| 272 |
-
|
| 273 |
-
protein_sequence = ''.join(line for line in lines if not line.startswith('>'))
|
| 274 |
-
|
| 275 |
-
# Check for invalid characters
|
| 276 |
-
valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy") # the 20 standard amino acids
|
| 277 |
-
if not set(protein_sequence).issubset(valid_characters):
|
| 278 |
-
return None, "Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids. Does your sequence contain gaps?", None
|
| 279 |
-
|
| 280 |
|
| 281 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|
| 282 |
input_ids = encoded_input["input_ids"]
|
|
@@ -360,18 +354,7 @@ def mask_residue(sequence, position):
|
|
| 360 |
return sequence[:position] + 'X' + sequence[position+1:]
|
| 361 |
|
| 362 |
def generate_heatmap(protein_fasta):
|
| 363 |
-
|
| 364 |
-
header = lines[0]
|
| 365 |
-
protein_sequence = ''.join(lines[1:])
|
| 366 |
-
|
| 367 |
-
# Check if the header is valid
|
| 368 |
-
if not header.startswith('>'):
|
| 369 |
-
return None, "Invalid FASTA format. Header should start with '>'.", None
|
| 370 |
-
|
| 371 |
-
# Check for invalid characters in the sequence
|
| 372 |
-
valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")
|
| 373 |
-
if not set(protein_sequence).issubset(valid_characters):
|
| 374 |
-
return None, "Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids.", None
|
| 375 |
|
| 376 |
# Tokenize and predict for original sequence
|
| 377 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|
|
|
|
| 201 |
img = Image.open(buf)
|
| 202 |
return img
|
| 203 |
|
| 204 |
+
def preprocess_protein_sequence(protein_fasta):
    """Validate a single-record FASTA string and extract its amino-acid sequence.

    Parameters
    ----------
    protein_fasta : str
        Raw FASTA text: at most one '>' header line followed by sequence lines.

    Returns
    -------
    tuple
        (protein_sequence, None) on success, or (None, error_message) when the
        input contains multiple records, non-standard characters, or no
        sequence at all.
    """
    # Strip each line so pasted CRLF input ('\r') or stray surrounding
    # whitespace is not flagged as an invalid residue below.
    lines = [line.strip() for line in protein_fasta.split('\n')]

    headers = [line for line in lines if line.startswith('>')]
    if len(headers) > 1:
        return None, "Multiple fasta sequences detected. Please upload a fasta file with only one sequence."

    protein_sequence = ''.join(line for line in lines if not line.startswith('>'))

    # Empty input would otherwise "pass" validation (empty set is a subset of
    # anything) and be handed to the tokenizer as an empty sequence.
    if not protein_sequence:
        return None, "No protein sequence found. Please provide a fasta sequence."

    # Check for invalid characters
    valid_characters = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")  # the 20 standard amino acids
    if not set(protein_sequence).issubset(valid_characters):
        return None, "Invalid protein sequence. It contains characters that are not one of the 20 standard amino acids. Does your sequence contain gaps?"

    return protein_sequence, None
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def process_family_sequence(protein_fasta):
|
| 222 |
+
protein_sequence, error_msg = preprocess_protein_sequence(protein_fasta)
|
| 223 |
+
if error_msg:
|
| 224 |
+
return None, None, None, error_msg
|
| 225 |
|
| 226 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|
| 227 |
input_idsfam = encoded_input["input_ids"]
|
|
|
|
| 270 |
|
| 271 |
|
| 272 |
def process_single_sequence(protein_fasta): #, protein_file
|
| 273 |
+
protein_sequence, error_msg = preprocess_protein_sequence(protein_fasta)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|
| 276 |
input_ids = encoded_input["input_ids"]
|
|
|
|
| 354 |
return sequence[:position] + 'X' + sequence[position+1:]
|
| 355 |
|
| 356 |
def generate_heatmap(protein_fasta):
|
| 357 |
+
protein_sequence, error_msg = preprocess_protein_sequence(protein_fasta)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
|
| 359 |
# Tokenize and predict for original sequence
|
| 360 |
encoded_input = tokenizer([protein_sequence], padding=True, truncation=True, max_length=512, return_tensors="pt")
|