Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12, 2025

Commit

962ae70

verified ·

1 Parent(s): 7aea9ac

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -35

app.py CHANGED Viewed

@@ -8,6 +8,10 @@ import matplotlib.pyplot as plt
 import io
 from PIL import Image
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
         super(VirusClassifier, self).__init__()
@@ -28,6 +32,11 @@ class VirusClassifier(nn.Module):
     def forward(self, x):
         return self.network(x)
 def parse_fasta(text):
     """Parse FASTA formatted text into a list of (header, sequence)."""
     sequences = []
@@ -66,6 +75,11 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     return vec
 def calculate_shap_values(model, x_tensor):
     """
     Calculate SHAP values using a simple ablation approach.
@@ -76,22 +90,88 @@ def calculate_shap_values(model, x_tensor):
         # Get baseline prediction
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
-        baseline_prob = baseline_probs[0, 1].item()  # Probability of human class
         # Calculate impact of zeroing each feature
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
-            x_zeroed[0, i] = 0
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
             prob = probs[0, 1].item()
-            impact = baseline_prob - prob  # How much removing the feature changed the prediction
             shap_values.append(impact)
-            x_zeroed[0, i] = x_tensor[0, i]  # Restore the original value
     return np.array(shap_values), baseline_prob
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     """Create a bar plot of the most important k-mers."""
     plt.rcParams.update({'font.size': 10})
@@ -108,7 +188,7 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     plt.yticks(range(len(values)), features)
     plt.xlabel('SHAP value (impact on model output)')
     plt.title(f'Top {top_k} Most Influential k-mers')
-    plt.gca().invert_yaxis()  # Most important at top
     return plt.gcf()
@@ -147,16 +227,14 @@ def visualize_sequence_impacts(sequence, kmers, shap_values, base_prob):
     # Plot k-mers with controlled spacing
     y_spacing = 0.9 / max(len(display_kmers), 1)
     y_position = 0.95
-    max_seq_display = 100  # Maximum sequence length to show
     for pos, kmer, impact in display_kmers:
-        # Truncate sequence display if too long
         pre_sequence = sequence[max(0, pos-20):pos]
-        post_sequence = sequence[pos+k:min(pos+k+20, len(sequence))]
         # Add ellipsis if truncated
         pre_ellipsis = "..." if pos > 20 else ""
-        post_ellipsis = "..." if pos+k+20 < len(sequence) else ""
         # Choose color based on impact
         color = '#ffcccb' if impact > 0 else '#cce0ff'
@@ -165,9 +243,9 @@ def visualize_sequence_impacts(sequence, kmers, shap_values, base_prob):
         # Draw text elements
         plt.text(0.01, y_position, f"{pre_ellipsis}{pre_sequence}", fontsize=9)
         plt.text(0.01 + len(f"{pre_ellipsis}{pre_sequence}")/50, y_position,
-                kmer, fontsize=9, bbox=dict(facecolor=color, alpha=0.3, pad=1))
         plt.text(0.01 + (len(f"{pre_ellipsis}{pre_sequence}") + len(kmer))/50,
-                y_position, f"{post_sequence}{post_ellipsis}", fontsize=9)
         # Add impact value
         plt.text(0.8, y_position, f"{arrow} {impact:+.3f}", fontsize=9)
@@ -176,10 +254,29 @@ def visualize_sequence_impacts(sequence, kmers, shap_values, base_prob):
     plt.axis('off')
-    # Adjust layout with specific margins
     plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
     return fig
 def predict(file_obj, top_kmers=10, fasta_text=""):
     """Main prediction function for Gradio interface."""
     # Handle input
@@ -190,25 +287,26 @@ def predict(file_obj, top_kmers=10, fasta_text=""):
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
-            return f"Error reading file: {str(e)}", None, None
     else:
-        return "Please provide a FASTA sequence.", None, None
     # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
-        return "No valid FASTA sequences found.", None, None
     header, seq = sequences[0]
-    # Load model and process sequence
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
         model = VirusClassifier(256).to(device)
-        model.load_state_dict(torch.load('model.pt', map_location=device, weights_only=True))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
-        return f"Error loading model: {str(e)}", None, None
     # Generate features
     freq_vector = sequence_to_kmer_vector(seq)
@@ -218,34 +316,38 @@ def predict(file_obj, top_kmers=10, fasta_text=""):
     # Calculate SHAP values and get prediction
     shap_values, prob_human = calculate_shap_values(model, x_tensor)
-    # Generate result text
     results = [
         f"Sequence: {header}",
         f"Prediction: {'Human' if prob_human > 0.5 else 'Non-human'} Origin",
-        f"Confidence: {max(prob_human, 1-prob_human):.3f}",
         f"Human Probability: {prob_human:.3f}",
         "\nTop Contributing k-mers:"
     ]
-    # Get k-mers for visualization
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
-    # Create visualizations
     importance_plot = create_importance_bar_plot(shap_values, kmers, top_kmers)
     sequence_plot = visualize_sequence_impacts(seq, kmers, shap_values, prob_human)
-    # Convert plots to images
-    def fig_to_image(fig):
-        buf = io.BytesIO()
-        fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
-        buf.seek(0)
-        img = Image.open(buf)
-        plt.close(fig)
-        return img
-    return "\n".join(results), fig_to_image(importance_plot), fig_to_image(sequence_plot)
-# Create Gradio interface
 css = """
 .gradio-container {
     font-family: 'IBM Plex Sans', sans-serif;
@@ -283,11 +385,12 @@ with gr.Blocks(css=css) as iface:
             results = gr.Textbox(label="Analysis Results", lines=10)
             kmer_plot = gr.Image(label="K-mer Importance Plot")
             shap_plot = gr.Image(label="Sequence Impact Visualization (SHAP-style)")
     submit_btn.click(
         predict,
         inputs=[file_input, top_k, text_input],
-        outputs=[results, kmer_plot, shap_plot]
     )
     gr.Markdown("""
@@ -298,7 +401,10 @@ with gr.Blocks(css=css) as iface:
       - Blue highlights = pushing toward non-human origin
       - Arrows (↑/↓) show impact direction
       - Values show impact magnitude
     """)
 if __name__ == "__main__":
-    iface.launch()

 import io
 from PIL import Image
+###############################################################################
+# 1. MODEL DEFINITION
+###############################################################################
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
         super(VirusClassifier, self).__init__()
     def forward(self, x):
         return self.network(x)
+###############################################################################
+# 2. FASTA PARSING & K-MER FEATURE ENGINEERING
+###############################################################################
 def parse_fasta(text):
     """Parse FASTA formatted text into a list of (header, sequence)."""
     sequences = []
     return vec
+###############################################################################
+# 3. SHAP-VALUE (ABLATION) CALCULATION
+###############################################################################
 def calculate_shap_values(model, x_tensor):
     """
     Calculate SHAP values using a simple ablation approach.
         # Get baseline prediction
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
+        baseline_prob = baseline_probs[0, 1].item()  # Probability of 'human' class
         # Calculate impact of zeroing each feature
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
+            orig_value = x_zeroed[0, i].item()
+            x_zeroed[0, i] = 0.0
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
             prob = probs[0, 1].item()
+            impact = baseline_prob - prob  # how much removing the feature changed the prediction
             shap_values.append(impact)
+            x_zeroed[0, i] = orig_value  # restore the original value
     return np.array(shap_values), baseline_prob
+###############################################################################
+# 4. PER-BASE SHAP AGGREGATION (LINEAR HEATMAP)
+###############################################################################
def compute_positionwise_scores(sequence, shap_values, k=4):
    """
    Spread k-mer SHAP values onto individual bases.

    Every window of length ``k`` in ``sequence`` that is a pure A/C/G/T
    k-mer contributes its SHAP value to each of the ``k`` bases it covers;
    the score of a base is the mean of all contributions it received.

    Args:
        sequence: The sequence to score (sliced into length-``k`` windows).
        shap_values: Per-k-mer values, indexed by the k-mer's rank in the
            lexicographic enumeration of ``product("ACGT", repeat=k)``.
        k: Window (k-mer) length.

    Returns:
        A float32 array with one entry per base of ``sequence``. Bases not
        covered by any recognised k-mer are 0.0.
    """
    # Rank of every k-mer in lexicographic A < C < G < T order.
    index_of = {
        ''.join(letters): rank
        for rank, letters in enumerate(product("ACGT", repeat=k))
    }

    n_bases = len(sequence)
    totals = np.zeros(n_bases, dtype=np.float32)  # summed SHAP per base
    hits = np.zeros(n_bases, dtype=np.float32)    # number of covering k-mers

    for start in range(n_bases - k + 1):
        stop = start + k
        rank = index_of.get(sequence[start:stop])
        if rank is None:
            # Window contains a character outside ACGT; it adds nothing.
            continue
        totals[start:stop] += shap_values[rank]
        hits[start:stop] += 1

    # Uncovered bases would divide by zero; silence the warning and
    # replace those entries with 0.0 afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        averaged = totals / hits
    return np.where(hits > 0, averaged, 0.0)
def plot_linear_heatmap(shap_means):
    """
    Plot a single-row heatmap of per-base SHAP contributions.

    The colour scale is centred on zero so that, with the diverging
    'RdBu_r' map, negative values (push toward Non-Human) are always blue,
    positive values (push toward Human) are always red, and zero is the
    neutral midpoint. Without symmetric limits the scale would be stretched
    between the data minimum and maximum, and the sign of a value could not
    be read from its colour.

    Args:
        shap_means: 1D array of per-base scores (must support ``reshape``).

    Returns:
        The Matplotlib figure holding the heatmap and its colorbar.
    """
    # Reshape into (1, -1) so that imshow displays it as a single row
    heatmap_data = shap_means.reshape(1, -1)

    # Symmetric limits around zero; skipped for empty or non-finite data,
    # in which case Matplotlib's default autoscaling is used.
    color_limits = {}
    if heatmap_data.size:
        peak = float(abs(heatmap_data).max())
        if peak < float('inf'):  # also False when peak is NaN
            peak = peak or 1.0   # all-zero data: any symmetric range works
            color_limits = {'vmin': -peak, 'vmax': peak}

    fig, ax = plt.subplots(figsize=(12, 2))
    image = ax.imshow(heatmap_data, aspect='auto', cmap='RdBu_r', **color_limits)

    # Attach the colorbar to this figure/axes explicitly rather than relying
    # on pyplot's notion of the "current" axes.
    cbar = fig.colorbar(image, ax=ax, orientation='horizontal', pad=0.2)
    cbar.set_label('SHAP Contribution')

    ax.set_yticks([])  # single row, so hide the y-axis
    ax.set_xlabel('Position in Sequence')
    ax.set_title('Per-base SHAP Heatmap')
    fig.tight_layout()
    return fig
+###############################################################################
+# 5. OTHER PLOTS: BAR PLOT OF TOP-K AND SEQUENCE IMPACT VISUALIZATION
+###############################################################################
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     """Create a bar plot of the most important k-mers."""
     plt.rcParams.update({'font.size': 10})
     plt.yticks(range(len(values)), features)
     plt.xlabel('SHAP value (impact on model output)')
     plt.title(f'Top {top_k} Most Influential k-mers')
+    plt.gca().invert_yaxis()  # most important at top
     return plt.gcf()
     # Plot k-mers with controlled spacing
     y_spacing = 0.9 / max(len(display_kmers), 1)
     y_position = 0.95
     for pos, kmer, impact in display_kmers:
         pre_sequence = sequence[max(0, pos-20):pos]
+        post_sequence = sequence[pos+len(kmer):min(pos+len(kmer)+20, len(sequence))]
         # Add ellipsis if truncated
         pre_ellipsis = "..." if pos > 20 else ""
+        post_ellipsis = "..." if pos+len(kmer)+20 < len(sequence) else ""
         # Choose color based on impact
         color = '#ffcccb' if impact > 0 else '#cce0ff'
         # Draw text elements
         plt.text(0.01, y_position, f"{pre_ellipsis}{pre_sequence}", fontsize=9)
         plt.text(0.01 + len(f"{pre_ellipsis}{pre_sequence}")/50, y_position,
+                 kmer, fontsize=9, bbox=dict(facecolor=color, alpha=0.3, pad=1))
         plt.text(0.01 + (len(f"{pre_ellipsis}{pre_sequence}") + len(kmer))/50,
+                 y_position, f"{post_sequence}{post_ellipsis}", fontsize=9)
         # Add impact value
         plt.text(0.8, y_position, f"{arrow} {impact:+.3f}", fontsize=9)
     plt.axis('off')
+    # Adjust layout
     plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
     return fig
+###############################################################################
+# 6. HELPER FUNCTION: FIG TO IMAGE
+###############################################################################
def fig_to_image(fig):
    """
    Convert a Matplotlib figure to a PIL Image and close the figure.

    The figure is rendered as a PNG (150 dpi, tight bounding box) into an
    in-memory buffer, which is then opened with PIL.

    Args:
        fig: The Matplotlib figure to render.

    Returns:
        A PIL Image backed by the in-memory PNG buffer.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
    finally:
        # Always release the figure, even if rendering fails, so repeated
        # requests do not accumulate open figures.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
+###############################################################################
+# 7. MAIN PREDICTION FUNCTION
+###############################################################################
 def predict(file_obj, top_kmers=10, fasta_text=""):
     """Main prediction function for Gradio interface."""
     # Handle input
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
+            return f"Error reading file: {str(e)}", None, None, None
     else:
+        return "Please provide a FASTA sequence.", None, None, None
     # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
+        return "No valid FASTA sequences found.", None, None, None
     header, seq = sequences[0]
+    # Load model and scaler
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
         model = VirusClassifier(256).to(device)
+        # NOTE: 'weights_only=True' is a supported torch.load argument (PyTorch >= 1.13) that restricts unpickling; it is omitted here, so only load a trusted model.pt.
+        model.load_state_dict(torch.load('model.pt', map_location=device))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
+        return f"Error loading model: {str(e)}", None, None, None
     # Generate features
     freq_vector = sequence_to_kmer_vector(seq)
     # Calculate SHAP values and get prediction
     shap_values, prob_human = calculate_shap_values(model, x_tensor)
+    # Prediction text
     results = [
         f"Sequence: {header}",
         f"Prediction: {'Human' if prob_human > 0.5 else 'Non-human'} Origin",
+        f"Confidence: {max(prob_human, 1 - prob_human):.3f}",
         f"Human Probability: {prob_human:.3f}",
         "\nTop Contributing k-mers:"
     ]
+    # Create k-mer lists for visualization
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
+    # 1) K-mer importance bar plot
     importance_plot = create_importance_bar_plot(shap_values, kmers, top_kmers)
+    importance_img = fig_to_image(importance_plot)
+    # 2) SHAP-style textual sequence impact
     sequence_plot = visualize_sequence_impacts(seq, kmers, shap_values, prob_human)
+    sequence_img = fig_to_image(sequence_plot)
+    # 3) Linear heatmap across full genome
+    shap_means = compute_positionwise_scores(seq, shap_values, k=4)
+    heatmap_fig = plot_linear_heatmap(shap_means)
+    heatmap_img = fig_to_image(heatmap_fig)
+    return "\n".join(results), importance_img, sequence_img, heatmap_img
+###############################################################################
+# 8. BUILD GRADIO INTERFACE
+###############################################################################
 css = """
 .gradio-container {
     font-family: 'IBM Plex Sans', sans-serif;
             results = gr.Textbox(label="Analysis Results", lines=10)
             kmer_plot = gr.Image(label="K-mer Importance Plot")
             shap_plot = gr.Image(label="Sequence Impact Visualization (SHAP-style)")
+            heatmap_plot = gr.Image(label="Genome Heatmap")
     submit_btn.click(
         predict,
         inputs=[file_input, top_k, text_input],
+        outputs=[results, kmer_plot, shap_plot, heatmap_plot]
     )
     gr.Markdown("""
       - Blue highlights = pushing toward non-human origin
       - Arrows (↑/↓) show impact direction
       - Values show impact magnitude
+    - **Genome Heatmap**: Per-base SHAP values across the entire sequence
+      - Red = push toward human
+      - Blue = push toward non-human
     """)
 if __name__ == "__main__":
+    iface.launch()