Spaces:

hiyata
/

HostClassifier

Sleeping

App Files Files Community

hiyata committed on Jan 12, 2025

Commit

7e92f7c

verified ·

1 Parent(s): 555d484

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -129

app.py CHANGED Viewed

@@ -29,9 +29,7 @@ class VirusClassifier(nn.Module):
         return self.network(x)
 def parse_fasta(text):
-    """
-    Parses FASTA formatted text into a list of (header, sequence).
-    """
     sequences = []
     current_header = None
     current_sequence = []
@@ -52,9 +50,7 @@ def parse_fasta(text):
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
-    """
-    Convert a sequence to a k-mer frequency vector.
-    """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
@@ -72,130 +68,130 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
 def calculate_shap_values(model, x_tensor):
     """
-    Calculate SHAP-like values using a simple ablation approach.
     """
     model.eval()
     with torch.no_grad():
         baseline_output = model(x_tensor)
-        baseline_prob = torch.softmax(baseline_output, dim=1)[0, 1].item()
         shap_values = []
         for i in range(x_tensor.shape[1]):
-            perturbed_input = x_tensor.clone()
-            perturbed_input[0, i] = 0  # Ablate feature
-            output = model(perturbed_input)
-            prob = torch.softmax(output, dim=1)[0, 1].item()
-            shap_values.append(baseline_prob - prob)
     return np.array(shap_values), baseline_prob
-def create_importance_plot(shap_values, kmers, top_k=10):
-    """
-    Create horizontal bar plot of feature importance.
-    """
-    # Set style directly instead of using seaborn
-    plt.rcParams['figure.facecolor'] = '#ffffff'
-    plt.rcParams['axes.facecolor'] = '#ffffff'
-    plt.rcParams['axes.grid'] = True
-    plt.rcParams['grid.alpha'] = 0.3
-    fig = plt.figure(figsize=(10, 8))
     # Sort by absolute importance
     indices = np.argsort(np.abs(shap_values))[-top_k:]
     values = shap_values[indices]
     features = [kmers[i] for i in indices]
-    colors = ['#2ecc71' if v > 0 else '#e74c3c' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
-    plt.xlabel('Impact on Prediction (SHAP value)')
     plt.title(f'Top {top_k} Most Influential k-mers')
-    plt.gca().invert_yaxis()
-    return fig
-def create_contribution_plot(important_kmers, final_prob):
     """
-    Create waterfall plot showing cumulative feature contributions.
     """
-    # Set style parameters
-    plt.rcParams['figure.facecolor'] = '#ffffff'
-    plt.rcParams['axes.facecolor'] = '#ffffff'
-    plt.rcParams['axes.grid'] = True
-    plt.rcParams['grid.alpha'] = 0.3
-    fig, ax = plt.subplots(figsize=(12, 6))
-    base_prob = 0.5
-    cumulative = [base_prob]
-    labels = ['Base']
-    for kmer_info in important_kmers:
-        cumulative.append(cumulative[-1] + kmer_info['impact'])
-        labels.append(kmer_info['kmer'])
-    # Plot cumulative line with markers
-    line = ax.plot(range(len(cumulative)), cumulative, '-o',
-                  color='#3498db', linewidth=2,
-                  marker='o', markersize=8,
-                  markerfacecolor='white',
-                  markeredgecolor='#3498db',
-                  markeredgewidth=2)
-    # Add reference line at 0.5
-    ax.axhline(y=0.5, color='#95a5a6', linestyle='--', alpha=0.5)
-    # Customize plot
-    ax.set_xticks(range(len(labels)))
-    ax.set_xticklabels(labels, rotation=45, ha='right')
-    ax.set_ylim(0, 1)
-    ax.grid(True, axis='y', linestyle='--', alpha=0.3)
-    ax.set_title('Cumulative Feature Contributions')
-    ax.set_ylabel('Probability of Human Origin')
-    # Add value labels
-    for i, prob in enumerate(cumulative):
-        ax.annotate(f'{prob:.3f}',
-                   (i, prob),
-                   xytext=(0, 10),
-                   textcoords='offset points',
-                   ha='center',
-                   va='bottom')
-    # Adjust layout to prevent label cutoff
     plt.tight_layout()
     return fig
 def predict(file_obj, top_kmers=10, fasta_text=""):
-    """
-    Main prediction function for the Gradio interface.
-    """
     # Handle input
     if fasta_text.strip():
         text = fasta_text.strip()
     elif file_obj is not None:
         try:
-            # File input will be a filepath since we specified type="filepath"
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
-            return f"Error reading file: {str(e)}\nPlease ensure you're uploading a valid FASTA text file.", None, None
     else:
-        return "Please provide a FASTA sequence either by file upload or text input.", None, None
     # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
-        return "No valid FASTA sequences found in input.", None, None
     header, seq = sequences[0]
-    # Process sequence
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
         model = VirusClassifier(256).to(device)
-        # Load model weights safely
         model.load_state_dict(torch.load('model.pt', map_location=device, weights_only=True))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
@@ -206,42 +202,24 @@ def predict(file_obj, top_kmers=10, fasta_text=""):
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
     x_tensor = torch.FloatTensor(scaled_vector).to(device)
-    # Calculate SHAP values and predictions
-    shap_values, human_prob = calculate_shap_values(model, x_tensor)
-    # Generate k-mer information
-    kmers = [''.join(p) for p in product("ACGT", repeat=4)]
-    important_indices = np.argsort(np.abs(shap_values))[-top_kmers:]
-    important_kmers = []
-    for idx in important_indices:
-        important_kmers.append({
-            'kmer': kmers[idx],
-            'impact': shap_values[idx],
-            'frequency': freq_vector[idx] * 100,
-            'significance': scaled_vector[0][idx]
-        })
-    # Format results text
     results = [
         f"Sequence: {header}",
-        f"Prediction: {'Human' if human_prob > 0.5 else 'Non-human'} Origin",
-        f"Confidence: {max(human_prob, 1-human_prob):.3f}",
-        f"Human Probability: {human_prob:.3f}",
-        "\nTop Contributing k-mers:",
     ]
-    for kmer in important_kmers:
-        direction = "→ Human" if kmer['impact'] > 0 else "→ Non-human"
-        results.append(
-            f"• {kmer['kmer']}: {direction} "
-            f"(impact: {kmer['impact']:.3f}, "
-            f"freq: {kmer['frequency']:.2f}%)"
-        )
-    # Generate plots
-    shap_plot = create_importance_plot(shap_values, kmers, top_kmers)
-    contribution_plot = create_contribution_plot(important_kmers, human_prob)
     # Convert plots to images
     def fig_to_image(fig):
@@ -252,30 +230,19 @@ def predict(file_obj, top_kmers=10, fasta_text=""):
         plt.close(fig)
         return img
-    return "\n".join(results), fig_to_image(shap_plot), fig_to_image(contribution_plot)
 # Create Gradio interface
 css = """
 .gradio-container {
     font-family: 'IBM Plex Sans', sans-serif;
 }
-.interpretation-container {
-    margin-top: 20px;
-    padding: 15px;
-    border-radius: 8px;
-    background-color: #f8f9fa;
-}
 """
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
     # Virus Host Classifier
-    This tool predicts whether a viral sequence is likely of human or non-human origin using k-mer frequency analysis.
-    ### Instructions
-    1. Upload a FASTA file or paste your sequence in FASTA format
-    2. Adjust the number of top k-mers to display (default: 10)
-    3. View the prediction results and feature importance visualizations
     """)
     with gr.Row():
@@ -283,7 +250,7 @@ with gr.Blocks(css=css) as iface:
             file_input = gr.File(
                 label="Upload FASTA file",
                 file_types=[".fasta", ".fa", ".txt"],
-                type="filepath"  # Changed to filepath which is one of the valid options
             )
             text_input = gr.Textbox(
                 label="Or paste FASTA sequence",
@@ -292,7 +259,7 @@ with gr.Blocks(css=css) as iface:
             )
             top_k = gr.Slider(
                 minimum=5,
-                maximum=20,
                 value=10,
                 step=1,
                 label="Number of top k-mers to display"
@@ -301,20 +268,23 @@ with gr.Blocks(css=css) as iface:
         with gr.Column(scale=2):
             results = gr.Textbox(label="Analysis Results", lines=10)
-            shap_plot = gr.Image(label="Feature Importance Plot")
-            contribution_plot = gr.Image(label="Cumulative Contribution Plot")
     submit_btn.click(
         predict,
         inputs=[file_input, top_k, text_input],
-        outputs=[results, shap_plot, contribution_plot]
     )
     gr.Markdown("""
-    ### About
-    - Uses 4-mer frequencies as sequence features
-    - Employs SHAP-like values for feature importance interpretation
-    - Visualizes cumulative feature contributions to the final prediction
     """)
 if __name__ == "__main__":

         return self.network(x)
 def parse_fasta(text):
+    """Parse FASTA formatted text into a list of (header, sequence)."""
     sequences = []
     current_header = None
     current_sequence = []
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
+    """Convert a sequence to a k-mer frequency vector."""
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
 def calculate_shap_values(model, x_tensor):
     """
     Estimate per-feature importance by single-feature ablation.

     Each feature of the first row of ``x_tensor`` is set to zero in turn and
     the resulting change in the softmax probability of class index 1 (the
     class the caller in this file reports as "human") is recorded. These are
     ablation deltas, not true Shapley values; the function name is kept so
     existing callers keep working.

     Args:
         model: Module mapping a ``(batch, n_features)`` tensor to
             ``(batch, n_classes)`` logits with at least two classes. It is
             switched to eval mode and left there.
         x_tensor: Input whose first row is explained. It is not modified.

     Returns:
         ``(shap_values, baseline_prob)``: a float64 NumPy array with one
         ``baseline_prob - ablated_prob`` entry per feature, and the
         unperturbed class-1 probability as a Python float.
     """
     model.eval()
     with torch.no_grad():
         # Get baseline prediction
         baseline_prob = torch.softmax(model(x_tensor), dim=1)[0, 1].item()

         n_features = x_tensor.shape[1]
         if n_features == 0:
             return np.array([]), baseline_prob

         # Build one row per feature, each a copy of the input with exactly
         # that feature zeroed (row i has column i set to 0), and score them
         # in a single forward pass instead of n_features separate passes.
         # NOTE(review): assumes rows are scored independently in eval mode
         # (true for Linear/BatchNorm/Dropout stacks) - confirm for the model.
         ablated = x_tensor[:1].repeat(n_features, 1)
         ablated.fill_diagonal_(0)
         ablated_probs = torch.softmax(model(ablated), dim=1)[:, 1]

     # Subtract in float64 so values match per-feature Python-float arithmetic.
     shap_values = baseline_prob - ablated_probs.double().cpu().numpy()
     return shap_values, baseline_prob
def create_importance_bar_plot(shap_values, kmers, top_k=10):
    """Create a horizontal bar plot of the most influential k-mers.

    Args:
        shap_values: 1-D sequence of per-k-mer impact scores.
        kmers: Sequence of k-mer labels, indexed like ``shap_values``.
        top_k: Number of k-mers to show, ranked by absolute impact. Values
            below 1 produce an empty plot.

    Returns:
        The Matplotlib figure, with the largest absolute impact on the top row.
    """
    plt.rcParams.update({'font.size': 10})

    shap_values = np.asarray(shap_values)
    # A plain ``[-top_k:]`` slice would select *every* feature for top_k == 0,
    # so clamp first and slice from the front of a descending ranking.
    top_k = max(int(top_k), 0)

    # Sort by absolute importance, most important first. Combined with the
    # inverted y-axis below this places the strongest k-mer on the top row.
    indices = np.argsort(np.abs(shap_values))[::-1][:top_k]
    values = shap_values[indices]
    features = [kmers[i] for i in indices]
    colors = ['#ff9999' if v > 0 else '#99ccff' for v in values]

    # Draw on an explicit figure/axes rather than pyplot's implicit "current"
    # figure, so the result does not depend on other plotting calls.
    fig, ax = plt.subplots(figsize=(10, 6))
    positions = list(range(len(values)))
    ax.barh(positions, values, color=colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(features)
    ax.set_xlabel('SHAP value (impact on model output)')
    ax.set_title(f'Top {top_k} Most Influential k-mers')
    ax.invert_yaxis()  # Row 0 (most important) at top
    return fig
def visualize_sequence_impacts(sequence, kmers, shap_values, base_prob):
    """
    Create a SHAP-style visualization of sequence impacts.

    Every position of ``sequence`` whose k-mer appears in ``kmers`` is paired
    with that k-mer's score from ``shap_values``; the highest-scoring
    occurrences by absolute value are drawn one per row, with the k-mer
    highlighted inside the sequence text and its signed impact at the right.
    Scores are looked up per k-mer, so repeated occurrences of the same k-mer
    share one value.

    Args:
        sequence: Sequence string to scan.
        kmers: Sequence of k-mer labels, indexed like ``shap_values``. All
            labels are assumed to share the length of the first one.
        shap_values: Per-k-mer impact scores.
        base_prob: Value shown in the "base value" caption.

    Returns:
        The Matplotlib figure (axes hidden).
    """
    max_rows = 30  # Show only the most impactful occurrences
    # Take the k-mer size from the table itself so it cannot drift from a
    # hard-coded constant; fall back to 4 when the table is empty.
    k = len(kmers[0]) if len(kmers) > 0 else 4
    kmer_dict = {km: i for i, km in enumerate(kmers)}

    # Find all k-mers and their impacts
    kmer_impacts = []
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i + k]
        if kmer in kmer_dict:
            kmer_impacts.append((i, kmer, shap_values[kmer_dict[kmer]]))

    # Sort by absolute impact and keep the rows that will be drawn
    kmer_impacts.sort(key=lambda item: abs(item[2]), reverse=True)
    top_impacts = kmer_impacts[:max_rows]

    # Draw on explicit figure/axes instead of pyplot's implicit current axes
    fig, ax = plt.subplots(figsize=(20, max(10, len(top_impacts) * 0.3)))

    # Add base value caption just above the axes area
    ax.text(0.01, 1.02, f"base value = {base_prob:.3f}", transform=ax.transAxes, fontsize=12)

    # Plot k-mers, one row per occurrence, stepping downward
    y_position = 1
    sequence_length = len(sequence)
    for pos, kmer, impact in top_impacts:
        pre_sequence = sequence[:pos]
        post_sequence = sequence[pos + k:]

        # Light red for positive impact, light blue otherwise
        color = '#ffcccb' if impact > 0 else '#cce0ff'
        arrow = '↑' if impact > 0 else '↓'

        # Horizontal offsets are proportional to character counts
        ax.text(0.01, y_position, pre_sequence, fontsize=10)
        ax.text(0.01 + len(pre_sequence) / (sequence_length * 1.5), y_position,
                kmer, fontsize=10, bbox=dict(facecolor=color, alpha=0.3, pad=2))
        ax.text(0.01 + (len(pre_sequence) + len(kmer)) / (sequence_length * 1.5),
                y_position, post_sequence, fontsize=10)

        # Add impact value
        ax.text(0.8, y_position, f"{arrow} {impact:+.3f}", fontsize=10)
        y_position -= 0.03

    ax.axis('off')
    fig.tight_layout()
    return fig
 def predict(file_obj, top_kmers=10, fasta_text=""):
+    """Main prediction function for Gradio interface."""
     # Handle input
     if fasta_text.strip():
         text = fasta_text.strip()
     elif file_obj is not None:
         try:
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
+            return f"Error reading file: {str(e)}", None, None
     else:
+        return "Please provide a FASTA sequence.", None, None
     # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
+        return "No valid FASTA sequences found.", None, None
     header, seq = sequences[0]
+    # Load model and process sequence
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
         model = VirusClassifier(256).to(device)
         model.load_state_dict(torch.load('model.pt', map_location=device, weights_only=True))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
     x_tensor = torch.FloatTensor(scaled_vector).to(device)
+    # Calculate SHAP values and get prediction
+    shap_values, prob_human = calculate_shap_values(model, x_tensor)
+    # Generate result text
     results = [
         f"Sequence: {header}",
+        f"Prediction: {'Human' if prob_human > 0.5 else 'Non-human'} Origin",
+        f"Confidence: {max(prob_human, 1-prob_human):.3f}",
+        f"Human Probability: {prob_human:.3f}",
+        "\nTop Contributing k-mers:"
     ]
+    # Get k-mers for visualization
+    kmers = [''.join(p) for p in product("ACGT", repeat=4)]
+    # Create visualizations
+    importance_plot = create_importance_bar_plot(shap_values, kmers, top_kmers)
+    sequence_plot = visualize_sequence_impacts(seq, kmers, shap_values, prob_human)
     # Convert plots to images
     def fig_to_image(fig):
         plt.close(fig)
         return img
+    return "\n".join(results), fig_to_image(importance_plot), fig_to_image(sequence_plot)
 # Create Gradio interface
 css = """
 .gradio-container {
     font-family: 'IBM Plex Sans', sans-serif;
 }
 """
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
     # Virus Host Classifier
+    Predicts whether a viral sequence is of human or non-human origin using k-mer analysis.
     """)
     with gr.Row():
             file_input = gr.File(
                 label="Upload FASTA file",
                 file_types=[".fasta", ".fa", ".txt"],
+                type="filepath"
             )
             text_input = gr.Textbox(
                 label="Or paste FASTA sequence",
             )
             top_k = gr.Slider(
                 minimum=5,
+                maximum=30,
                 value=10,
                 step=1,
                 label="Number of top k-mers to display"
         with gr.Column(scale=2):
             results = gr.Textbox(label="Analysis Results", lines=10)
+            kmer_plot = gr.Image(label="K-mer Importance Plot")
+            shap_plot = gr.Image(label="Sequence Impact Visualization (SHAP-style)")
     submit_btn.click(
         predict,
         inputs=[file_input, top_k, text_input],
+        outputs=[results, kmer_plot, shap_plot]
     )
     gr.Markdown("""
+    ### Visualization Guide
+    - **K-mer Importance Plot**: Shows the most influential k-mers and their SHAP values
+    - **Sequence Impact Visualization**: Shows the sequence with highlighted k-mers:
+      - Red highlights = pushing toward human origin
+      - Blue highlights = pushing toward non-human origin
+      - Arrows (↑/↓) show impact direction
+      - Values show impact magnitude
     """)
 if __name__ == "__main__":