Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Dec 23, 2024

Commit

870813f

verified ·

1 Parent(s): 4e29ba7

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -128

app.py CHANGED Viewed

@@ -4,11 +4,6 @@ import joblib
 import numpy as np
 from itertools import product
 import torch.nn as nn
-import logging
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
@@ -31,143 +26,88 @@ class VirusClassifier(nn.Module):
         return self.network(x)
 def sequence_to_kmer_vector(sequence: str, k: int = 6) -> np.ndarray:
-    """Convert sequence to k-mer frequency vector"""
-    try:
-        kmers = [''.join(p) for p in product("ACGT", repeat=k)]
-        kmer_dict = {kmer: 0 for kmer in kmers}
-        for i in range(len(sequence) - k + 1):
-            kmer = sequence[i:i+k]
-            if kmer in kmer_dict:  # only count valid kmers
-                kmer_dict[kmer] += 1
-        return np.array(list(kmer_dict.values()))
-    except Exception as e:
-        logger.error(f"Error in sequence_to_kmer_vector: {str(e)}")
-        raise
-def parse_fasta(content: str) -> list:
-    """Parse FASTA format from string content"""
-    try:
-        logger.info(f"Received file content length: {len(content)}")
-        sequences = []
-        current_header = None
-        current_sequence = []
-        for line in content.split('\n'):
-            line = line.strip()
-            if not line:
-                continue
-            if line.startswith('>'):
-                if current_header is not None:
-                    sequences.append((current_header, ''.join(current_sequence)))
-                current_header = line[1:]
-                current_sequence = []
-            else:
-                current_sequence.append(line.upper())
-        if current_header is not None:
-            sequences.append((current_header, ''.join(current_sequence)))
-        logger.info(f"Parsed {len(sequences)} sequences from FASTA")
-        return sequences
-    except Exception as e:
-        logger.error(f"Error parsing FASTA: {str(e)}")
-        raise
-def predict_sequence(fasta_file) -> str:
-    """Process FASTA input and return formatted predictions"""
-    try:
-        logger.info("Starting prediction process")
-        if fasta_file is None:
-            return "Please upload a FASTA file"
-        # Get file content - handle both string and file inputs
-        try:
-            if isinstance(fasta_file, str):
-                content = fasta_file
-            else:
-                content = fasta_file.name  # For Gradio file upload
-        except Exception as e:
-            logger.error(f"Error reading file: {str(e)}")
-            return f"Error reading file: {str(e)}"
-        device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        logger.info(f"Using device: {device}")
-        k = 4
-        # Load model and scaler
-        try:
-            logger.info("Loading model and scaler")
-            model = VirusClassifier(256).to(device)  # 256 = 4^4 for 4-mers
-            model.load_state_dict(torch.load('model.pt', map_location=device))
-            scaler = joblib.load('scaler.pkl')
-            model.eval()
-        except Exception as e:
-            logger.error(f"Error loading model or scaler: {str(e)}")
-            return f"Error loading model: {str(e)}"
-        # Process sequences
-        try:
-            sequences = parse_fasta(content)
-        except Exception as e:
-            logger.error(f"Error parsing FASTA file: {str(e)}")
-            return f"Error parsing FASTA file: {str(e)}"
-        results = []
-        for header, seq in sequences:
-            logger.info(f"Processing sequence: {header}")
-            try:
-                # Convert sequence to k-mer vector
-                kmer_vector = sequence_to_kmer_vector(seq, k)
-                kmer_vector = scaler.transform(kmer_vector.reshape(1, -1))
-                # Get prediction
-                with torch.no_grad():
-                    output = model(torch.FloatTensor(kmer_vector).to(device))
-                    probs = torch.softmax(output, dim=1)
-                # Format result
-                pred_class = 1 if probs[0][1] > probs[0][0] else 0
-                pred_label = 'human' if pred_class == 1 else 'non-human'
-                result = f"""
 Sequence: {header}
 Prediction: {pred_label}
 Confidence: {float(max(probs[0])):0.4f}
 Human probability: {float(probs[0][1]):0.4f}
 Non-human probability: {float(probs[0][0]):0.4f}
 """
-                results.append(result)
-                logger.info(f"Processed sequence {header} successfully")
-            except Exception as e:
-                logger.error(f"Error processing sequence {header}: {str(e)}")
-                results.append(f"Error processing sequence {header}: {str(e)}")
-        return "\n".join(results)
-    except Exception as e:
-        logger.error(f"Unexpected error in predict_sequence: {str(e)}")
-        return f"An unexpected error occurred: {str(e)}"
-# Create Gradio interface with both file upload and text input
 iface = gr.Interface(
-    fn=predict_sequence,
-    inputs=gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"])
-    outputs=gr.Textbox(label="Prediction Results", lines=10),
-    title="Virus Host Classifier",
-    description="""Upload a FASTA file or paste your sequence to predict whether a virus sequence is likely to infect human or non-human hosts.
-Example format:
->sequence_name
-ATCGATCGATCG...""",
-    examples=[["example.fasta", None]],
-    cache_examples=True
 )
-# Launch the interface
 iface.launch()

 import numpy as np
 from itertools import product
 import torch.nn as nn
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
         return self.network(x)
 def sequence_to_kmer_vector(sequence: str, k: int = 6) -> np.ndarray:
+    kmers = [''.join(p) for p in product("ACGT", repeat=k)]
+    kmer_dict = {kmer: 0 for kmer in kmers}
+    for i in range(len(sequence) - k + 1):
+        kmer = sequence[i:i+k]
+        if kmer in kmer_dict:
+            kmer_dict[kmer] += 1
+    return np.array(list(kmer_dict.values()))
def parse_fasta(text: str) -> list:
    """Parse FASTA-formatted text into ``(header, sequence)`` tuples.

    A line starting with ``>`` opens a new record; its header is the rest of
    that line. The following non-blank lines are stripped, upper-cased and
    concatenated into the record's sequence. Blank lines are skipped, and
    sequence lines that appear before the first header are discarded.

    Args:
        text: Full content of a FASTA file.

    Returns:
        A list of ``(header, sequence)`` tuples in file order. A record with an
        empty header (a bare ``>`` line) is kept, with ``""`` as its header.
    """
    sequences = []
    current_header = None
    current_sequence = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()  # also removes the '\r' of CRLF line endings
        if not line:
            continue
        if line.startswith('>'):
            # Compare against None, not truthiness: an empty header is still a
            # record and must not be silently dropped.
            if current_header is not None:
                sequences.append((current_header, ''.join(current_sequence)))
            current_header = line[1:]
            current_sequence = []
        elif current_header is not None:
            current_sequence.append(line.upper())
    if current_header is not None:
        sequences.append((current_header, ''.join(current_sequence)))
    return sequences
def _read_fasta_text(file) -> str:
    """Return the uploaded FASTA content as text.

    Handles the forms in which an upload can reach the callback: raw bytes,
    a filesystem path, or a file-like wrapper whose ``name`` attribute is the
    path of the temporary file.
    """
    if isinstance(file, (bytes, bytearray)):
        return bytes(file).decode('utf-8')
    if isinstance(file, str) or hasattr(file, '__fspath__'):
        path = file
    else:
        path = getattr(file, 'name', None)
        if path is None:
            raise TypeError(
                f"Unsupported upload type: {type(file).__name__}"
            )
    with open(path, encoding='utf-8') as handle:
        return handle.read()


def predict(file):
    """Classify every sequence in an uploaded FASTA file.

    Args:
        file: The upload as delivered by the UI: bytes, a path string, or a
            file-like object exposing ``.name``. A falsy value (nothing
            uploaded, or empty content) yields a prompt message.

    Returns:
        A single string with one formatted result section per FASTA record
        (label, confidence and both class probabilities), or a short message
        when there is nothing to classify.
    """
    if not file:
        return "Please upload a FASTA file"
    # Read and parse first so an unusable upload never pays for model loading.
    text = _read_fasta_text(file)
    sequences = parse_fasta(text)
    if not sequences:
        return "No FASTA records found in the uploaded file"
    # Load model and scaler. The classifier input width is the k-mer
    # vocabulary size (4 ** 6 == 4096), so k is kept in one place.
    k = 6
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = VirusClassifier(4 ** k).to(device)
    model.load_state_dict(torch.load('model.pt', map_location=device))
    scaler = joblib.load('scaler.pkl')
    model.eval()
    # Get predictions
    results = []
    for header, seq in sequences:
        # Get k-mer vector, shaped as a single-row batch for the scaler
        kmer_vector = sequence_to_kmer_vector(seq, k)
        kmer_vector = scaler.transform(kmer_vector.reshape(1, -1))
        # Predict
        with torch.no_grad():
            output = model(torch.FloatTensor(kmer_vector).to(device))
            probs = torch.softmax(output, dim=1)
        # Format results. Index 1 is reported as "human" and index 0 as
        # "non-human"; this assumes a two-class output -- TODO confirm
        # against the training code.
        pred_class = 1 if probs[0][1] > probs[0][0] else 0
        pred_label = 'human' if pred_class == 1 else 'non-human'
        result = (
            f"\nSequence: {header}\n"
            f"Prediction: {pred_label}\n"
            f"Confidence: {float(max(probs[0])):0.4f}\n"
            f"Human probability: {float(probs[0][1]):0.4f}\n"
            f"Non-human probability: {float(probs[0][0]):0.4f}\n"
        )
        results.append(result)
    return "\n".join(results)
# Create the interface.
# type="binary" asks Gradio to pass the uploaded file to the callback as a
# bytes object rather than as a path to a temporary file.
iface = gr.Interface(
    fn=predict,
    inputs=gr.File(label="Upload FASTA file", type="binary"),
    outputs=gr.Textbox(label="Results"),
    title="Virus Host Classifier",
)
# Launch
iface.launch()