Spaces:

hiyata
/

HostClassifier

Sleeping

App Files Files Community

hiyata committed on Jan 12, 2025

Commit

f1d4be6

verified ·

1 Parent(s): 8731787

Update app.py

Browse files

Files changed (1) hide show

app.py +315 -371

app.py CHANGED Viewed

@@ -2,13 +2,15 @@ import gradio as gr
 import torch
 import joblib
 import numpy as np
 import torch.nn as nn
 import matplotlib.pyplot as plt
 import io
 from PIL import Image
-from itertools import product
-# --------------- Model Definition ---------------
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
@@ -29,46 +31,20 @@ class VirusClassifier(nn.Module):
     def forward(self, x):
         return self.network(x)
-    def get_gradient_importance(self, x, class_index=1):
-        """
-        Calculate gradient-based importance for each input feature.
-        By default, we compute the gradient wrt the 'human' class (index=1).
-        This method is akin to a raw gradient or 'saliency' approach.
-        """
-        x = x.clone().detach().requires_grad_(True)
-        output = self.network(x)
-        probs = torch.softmax(output, dim=1)
-        # Probability of the specified class
-        target_prob = probs[..., class_index]
-        # Zero existing gradients if any
-        if x.grad is not None:
-            x.grad.zero_()
-        # Backprop on that probability
-        target_prob.backward()
-        # Raw gradient is now in x.grad
-        importance = x.grad.detach()
-        # Optional: Multiply by input to get a more "integrated gradients"-like measure
-        # importance = importance * x.detach()
-        return importance, float(target_prob)
-# --------------- Utility Functions ---------------
-def parse_fasta(text: str):
     """
-    Parse a FASTA string and return a list of (header, sequence) pairs.
     """
     sequences = []
     current_header = None
     current_sequence = []
-    for line in text.split('\n'):
         line = line.strip()
         if not line:
             continue
@@ -85,10 +61,8 @@ def parse_fasta(text: str):
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     """
-    Convert a nucleotide sequence into a k-mer frequency vector.
-    Defaults to k=4.
     """
-    # Generate all possible k-mers
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
@@ -104,385 +78,355 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     return vec
-def compute_sequence_stats(sequence: str):
-    """
-    Compute various statistics for a given sequence:
-      - Length
-      - GC content (%)
-      - A/C/G/T counts
     """
-    length = len(sequence)
-    if length == 0:
-        return {
-            'length': 0,
-            'gc_content': 0,
-            'counts': {'A': 0, 'C': 0, 'G': 0, 'T': 0}
-        }
-    counts = {
-        'A': sequence.count('A'),
-        'C': sequence.count('C'),
-        'G': sequence.count('G'),
-        'T': sequence.count('T')
-    }
-    gc_content = (counts['G'] + counts['C']) / length * 100.0
-    return {
-        'length': length,
-        'gc_content': gc_content,
-        'counts': counts
-    }
-# --------------- Visualization Functions ---------------
-def plot_shap_like_bars(kmers, importance_values, top_k=10):
     """
-    Create a bar chart that mimics a SHAP summary plot:
-      - k-mers on y-axis
-      - importance magnitude on x-axis
-      - color indicating positive (push towards human) vs negative (push towards non-human)
-    """
-    abs_importance = np.abs(importance_values)
-    # Sort by absolute importance
-    sorted_indices = np.argsort(abs_importance)[::-1]
-    top_indices = sorted_indices[:top_k]
-    # Prepare data
-    top_kmers = [kmers[i] for i in top_indices]
-    top_importances = importance_values[top_indices]
-    # Create plot
-    fig, ax = plt.subplots(figsize=(8, 6))
-    colors = ['green' if val > 0 else 'red' for val in top_importances]
-    ax.barh(range(len(top_kmers)), np.abs(top_importances), color=colors)
-    ax.set_yticks(range(len(top_kmers)))
-    ax.set_yticklabels(top_kmers)
-    ax.invert_yaxis()  # So that the highest value is at the top
-    ax.set_xlabel("Feature Importance (Gradient Magnitude)")
-    ax.set_title(f"Top-{top_k} SHAP-like Feature Importances")
-    plt.tight_layout()
-    return fig
-def plot_kmer_distribution(kmer_freq_vector, kmers):
-    """
-    Plot a histogram of k-mer frequencies for the entire vector.
-    (Optional if you want a quick distribution overview)
-    """
-    fig, ax = plt.subplots(figsize=(10, 4))
-    ax.bar(range(len(kmer_freq_vector)), kmer_freq_vector, color='blue', alpha=0.6)
-    ax.set_xlabel("K-mer Index")
-    ax.set_ylabel("Frequency")
-    ax.set_title("K-mer Frequency Distribution")
-    ax.set_xticks([])
-    plt.tight_layout()
-    return fig
-def create_step_visualization(important_kmers, human_prob):
     """
-    Re-implementation of your step-wise probability plot.
-    Shows how each top k-mer 'pushes' the probability from 0.5 to the final value.
     """
-    fig = plt.figure(figsize=(8, 5))
-    ax = fig.add_subplot(111)
-    # Start from 0.5
     current_prob = 0.5
     steps = [('Start', current_prob, 0)]
-    for kmer in important_kmers:
-        change = kmer['impact'] * (-1 if kmer['direction'] == 'non-human' else 1)
         current_prob += change
-        steps.append((kmer['kmer'], current_prob, change))
-    x_vals = range(len(steps))
-    y_vals = [s[1] for s in steps]
-    ax.step(x_vals, y_vals, 'b-', where='post', label='Probability', linewidth=2)
-    ax.plot(x_vals, y_vals, 'b.', markersize=10)
-    # Reference line at 0.5
-    ax.axhline(y=0.5, color='r', linestyle='--', label='Neutral (0.5)')
-    ax.set_ylim(0, 1)
-    ax.set_ylabel('Human Probability')
-    ax.set_title(f'K-mer Contributions (final p={human_prob:.3f})')
-    ax.grid(True, linestyle='--', alpha=0.7)
     for i, (kmer, prob, change) in enumerate(steps):
-        ax.annotate(kmer,
-                    (i, prob),
-                    xytext=(0, 10 if i % 2 == 0 else -20),
-                    textcoords='offset points',
-                    ha='center',
-                    rotation=45)
         if i > 0:
             change_text = f'{change:+.3f}'
             color = 'green' if change > 0 else 'red'
-            ax.annotate(change_text,
-                        (i, prob),
-                        xytext=(0, -20 if i % 2 == 0 else 10),
-                        textcoords='offset points',
-                        ha='center',
-                        color=color)
-    ax.legend()
-    plt.tight_layout()
-    return fig
-def plot_kmer_freq_and_sigma(important_kmers):
-    """
-    Plot frequencies vs. sigma from mean for the top k-mers.
-    This reuses logic from the original create_visualization second subplot,
-    but as its own function for clarity.
-    """
-    fig, ax = plt.subplots(figsize=(8, 5))
     # Prepare data
     kmers = [k['kmer'] for k in important_kmers]
     frequencies = [k['occurrence'] for k in important_kmers]
     sigmas = [k['sigma'] for k in important_kmers]
-    colors = ['green' if k['direction'] == 'human' else 'red' for k in important_kmers]
     x = np.arange(len(kmers))
     width = 0.35
-    # Frequency bars
-    ax.bar(x - width/2, frequencies, width, label='Frequency (%)', color=colors, alpha=0.6)
-    # Create a twin axis for sigma
-    ax2 = ax.twinx()
-    # Sigma bars
-    ax2.bar(x + width/2, sigmas, width, label='σ from mean',
-            color=[c if s > 0 else 'gray' for c, s in zip(colors, sigmas)], alpha=0.3)
-    ax.set_xticks(x)
-    ax.set_xticklabels(kmers, rotation=45)
-    ax.set_ylabel('Frequency (%)')
-    ax2.set_ylabel('Standard Deviations (σ) from Mean')
-    ax.set_title("K-mer Frequencies & Statistical Significance")
-    lines1, labels1 = ax.get_legend_handles_labels()
-    lines2, labels2 = ax2.get_legend_handles_labels()
-    ax.legend(lines1 + lines2, labels1 + labels2, loc='best')
     plt.tight_layout()
     return fig
-# --------------- Main Prediction Logic ---------------
-def predict_fasta(
-    file_obj,
-    k_size=4,
-    top_k=10,
-    advanced_analysis=False
-):
     """
-    Main function to predict classes for each sequence in an uploaded FASTA.
-    Returns:
-      - Combined textual report for all sequences
-      - A list of generated PIL Image plots
     """
-    # 1. Read raw text from file or string
-    if file_obj is None:
-        return "Please upload a FASTA file", []
-    try:
-        if isinstance(file_obj, str):
-            text = file_obj
-        else:
-            text = file_obj.decode('utf-8', errors='replace')
-    except Exception as e:
-        return f"Error reading file: {str(e)}", []
-    # 2. Parse the FASTA
-    sequences = parse_fasta(text)
-    if not sequences:
-        return "No valid FASTA sequences found!", []
-    # 3. Load model & scaler
     try:
-        device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        model = VirusClassifier(input_shape=(4 ** k_size)).to(device)
         state_dict = torch.load('model.pt', map_location=device)
         model.load_state_dict(state_dict)
-        model.eval()
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
-        return f"Error loading model/scaler: {str(e)}", []
-    # 4. Prepare k-mer dictionary for reference
-    all_kmers = [''.join(p) for p in product("ACGT", repeat=k_size)]
-    kmer_dict = {km: i for i, km in enumerate(all_kmers)}
-    # 5. Iterate over sequences and build output
-    final_text_report = []
-    plots = []
-    for idx, (header, seq) in enumerate(sequences, start=1):
-        seq_stats = compute_sequence_stats(seq)
-        # Convert sequence -> raw freq -> scaled freq
-        raw_kmer_freq = sequence_to_kmer_vector(seq, k=k_size)
-        scaled_kmer_freq = scaler.transform(raw_kmer_freq.reshape(1, -1))
-        X_tensor = torch.FloatTensor(scaled_kmer_freq).to(device)
-        # Predict
-        with torch.no_grad():
-            output = model(X_tensor)
-            probs = torch.softmax(output, dim=1)
-        # Determine class
-        pred_class = torch.argmax(probs, dim=1).item()
-        pred_label = 'human' if pred_class == 1 else 'non-human'
-        human_prob = float(probs[0][1])
-        non_human_prob = float(probs[0][0])
-        confidence = float(torch.max(probs[0]).item())
-        # Compute gradient-based importance
-        importance, target_prob = model.get_gradient_importance(X_tensor, class_index=1)
-        importance = importance[0].cpu().numpy()  # shape: (num_features,)
-        # Identify top-k features (by absolute gradient)
-        abs_importance = np.abs(importance)
-        sorted_indices = np.argsort(abs_importance)[::-1]
-        top_indices = sorted_indices[:top_k]
-        # Build a list of top k-mers
-        top_kmers_info = []
-        for i in top_indices:
-            kmer_name = all_kmers[i]
-            imp_val = float(importance[i])
-            direction = 'human' if imp_val > 0 else 'non-human'
-            freq_perc = float(raw_kmer_freq[i] * 100.0)  # in percent
-            sigma = float(scaled_kmer_freq[0][i])  # This is the scaled value (stdev from mean if the scaler is StandardScaler)
-            top_kmers_info.append({
-                'kmer': kmer_name,
-                'impact': abs(imp_val),
-                'direction': direction,
-                'occurrence': freq_perc,
-                'sigma': sigma
-            })
-        # Text summary for this sequence
-        seq_report = []
-        seq_report.append(f"=== Sequence {idx} ===")
-        seq_report.append(f"Header: {header}")
-        seq_report.append(f"Length: {seq_stats['length']}")
-        seq_report.append(f"GC Content: {seq_stats['gc_content']:.2f}%")
-        seq_report.append(f"A: {seq_stats['counts']['A']}, C: {seq_stats['counts']['C']}, G: {seq_stats['counts']['G']}, T: {seq_stats['counts']['T']}")
-        seq_report.append(f"Prediction: {pred_label} (Confidence: {confidence:.4f})")
-        seq_report.append(f"  Human Probability: {human_prob:.4f}")
-        seq_report.append(f"  Non-human Probability: {non_human_prob:.4f}")
-        seq_report.append(f"\nTop-{top_k} Influential k-mers (by gradient magnitude):")
-        for tkm in top_kmers_info:
-            seq_report.append(
-                f"  {tkm['kmer']}: pushes towards {tkm['direction']} "
-                f"(impact={tkm['impact']:.4f}), occurrence={tkm['occurrence']:.2f}%, "
-                f"sigma={tkm['sigma']:.2f}"
-            )
-        final_text_report.append("\n".join(seq_report))
-        # 6. Generate Plots (for each sequence)
-        if advanced_analysis:
-            # 6A. SHAP-like bar chart
-            fig_shap = plot_shap_like_bars(
-                kmers=all_kmers,
-                importance_values=importance,
-                top_k=top_k
-            )
-            buf_shap = io.BytesIO()
-            fig_shap.savefig(buf_shap, format='png', bbox_inches='tight', dpi=150)
-            buf_shap.seek(0)
-            plots.append(Image.open(buf_shap))
-            plt.close(fig_shap)
-            # 6B. k-mer distribution histogram
-            fig_kmer_dist = plot_kmer_distribution(raw_kmer_freq, all_kmers)
-            buf_dist = io.BytesIO()
-            fig_kmer_dist.savefig(buf_dist, format='png', bbox_inches='tight', dpi=150)
-            buf_dist.seek(0)
-            plots.append(Image.open(buf_dist))
-            plt.close(fig_kmer_dist)
-        # 6C. Original step visualization for top k k-mers
-        # Sort by actual 'impact' to preserve that step logic
-        # (largest absolute impact first)
-        top_kmers_info_step = sorted(top_kmers_info, key=lambda x: x['impact'], reverse=True)
-        fig_step = create_step_visualization(top_kmers_info_step, human_prob)
-        buf_step = io.BytesIO()
-        fig_step.savefig(buf_step, format='png', bbox_inches='tight', dpi=150)
-        buf_step.seek(0)
-        plots.append(Image.open(buf_step))
-        plt.close(fig_step)
-        # 6D. Frequency vs. sigma bar chart
-        fig_freq_sigma = plot_kmer_freq_and_sigma(top_kmers_info_step)
-        buf_freq_sigma = io.BytesIO()
-        fig_freq_sigma.savefig(buf_freq_sigma, format='png', bbox_inches='tight', dpi=150)
-        buf_freq_sigma.seek(0)
-        plots.append(Image.open(buf_freq_sigma))
-        plt.close(fig_freq_sigma)
-    # Combine all text results
-    combined_text = "\n\n".join(final_text_report)
-    return combined_text, plots
-# --------------- Gradio Interface ---------------
-def run_prediction(
-    file_obj,
-    k_size,
-    top_k,
-    advanced_analysis
-):
-    """
-    Wrapper for Gradio to handle the outputs in (text, List[Image]) form.
-    """
-    text_output, pil_images = predict_fasta(
-        file_obj=file_obj,
-        k_size=k_size,
-        top_k=top_k,
-        advanced_analysis=advanced_analysis
-    )
-    return text_output, pil_images
-with gr.Blocks() as demo:
-    gr.Markdown("# Virus Host Classifier (Improved!)")
-    gr.Markdown(
-        "Upload a FASTA file and configure k-mer size, number of top features, "
-        "and whether to run advanced analysis (plots of SHAP-like bars & k-mer distribution)."
-    )
-    with gr.Row():
-        with gr.Column():
-            fasta_file = gr.File(label="Upload FASTA file", type="binary")
-            kmer_slider = gr.Slider(minimum=2, maximum=6, value=4, step=1, label="K-mer Size")
-            topk_slider = gr.Slider(minimum=5, maximum=20, value=10, step=1, label="Top-k Features")
-            advanced_check = gr.Checkbox(value=False, label="Advanced Analysis")
-            predict_button = gr.Button("Predict")
-        with gr.Column():
-            results_text = gr.Textbox(
-                label="Results",
-                lines=20,
-                placeholder="Prediction results will appear here..."
-            )
-    # We can display multiple images in a Gallery or as separate outputs.
-    plots_gallery = gr.Gallery(label="Analysis Plots", columns=2)
-    predict_button.click(
-        fn=run_prediction,
-        inputs=[fasta_file, kmer_slider, topk_slider, advanced_check],
-        outputs=[results_text, plots_gallery]
-    )
 if __name__ == "__main__":
-    demo.launch(share=True)

 import torch
 import joblib
 import numpy as np
+from itertools import product
 import torch.nn as nn
 import matplotlib.pyplot as plt
 import io
 from PIL import Image
+##############################################################################
+# MODEL DEFINITION
+##############################################################################
 class VirusClassifier(nn.Module):
     def __init__(self, input_shape: int):
     def forward(self, x):
         return self.network(x)
+##############################################################################
+# UTILITIES
+##############################################################################
+def parse_fasta(text):
     """
+    Parses FASTA formatted text into a list of (header, sequence).
     """
     sequences = []
     current_header = None
     current_sequence = []
+    for line in text.strip().split('\n'):
         line = line.strip()
         if not line:
             continue
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     """
+    Convert a sequence to a k-mer frequency vector of size len(ACGT^k).
     """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
     return vec
+def ablation_importance(model, x_tensor):
     """
+    Estimate per-feature importance for one sample by zero-ablation.
+
+    The baseline probability of class index 1 ("human") is compared with the
+    probability obtained after setting each feature, one at a time, to 0.
+
+    Args:
+        model: module whose forward pass returns logits of shape
+            (batch, n_classes). It is switched to eval mode here.
+        x_tensor: 2-D tensor of shape (1, n_features). Only row 0 is used.
+            The tensor is not modified.
+
+    Returns:
+        (importances, p_base): importances is a float32 NumPy array of length
+        n_features holding delta_i = p_base - p_without_feature_i (positive
+        means that removing the feature *decreases* the probability => that
+        feature was pushing it higher); p_base is the baseline probability as
+        a Python float.
     """
+    model.eval()
+    n_features = x_tensor.shape[1]
+    # Build every ablated variant up front so the model runs once instead of
+    # n_features + 1 times: row 0 is the untouched sample, row i + 1 is the
+    # sample with feature i zeroed. repeat() copies, so x_tensor is untouched.
+    batch = x_tensor[:1].repeat(n_features + 1, 1)
+    feature_idx = torch.arange(n_features, device=batch.device)
+    batch[feature_idx + 1, feature_idx] = 0.0
+    with torch.no_grad():
+        human_probs = torch.softmax(model(batch), dim=1)[:, 1]
+    p_base = human_probs[0].item()
+    # Delta between the baseline and each ablated probability
+    importances = (human_probs[0] - human_probs[1:]).cpu().numpy().astype(np.float32)
+    return importances, p_base
+##############################################################################
+# PLOTTING
+##############################################################################
+def create_step_and_frequency_plot(important_kmers, human_prob, title):
     """
+    Build one figure with two stacked panels for the selected k-mers.
+
+    Top panel: a step plot that starts at 0.5 and adds each k-mer's signed
+    'impact' in list order. Bottom panel: per-k-mer frequency bars plus, on a
+    twin y-axis, the 'sigma' value of each k-mer.
+
+    Args:
+        important_kmers: list of dicts with keys 'kmer', 'impact',
+            'occurrence' and 'sigma'.
+        human_prob: final human probability; only used in the top panel title.
+        title: accepted but not used anywhere in this function.
+
+    Returns:
+        The matplotlib figure (not closed here).
     """
+    fig = plt.figure(figsize=(15, 10))
+    # Create grid for subplots
+    gs = plt.GridSpec(2, 1, height_ratios=[1.5, 1], hspace=0.3)
+    # 1. Probability Step Plot
+    ax1 = plt.subplot(gs[0])
     current_prob = 0.5
     steps = [('Start', current_prob, 0)]
+    # NOTE(review): the running sum is not clipped and is not forced to end at
+    # human_prob, while the y-axis below is fixed to [0, 1] - confirm intended.
+    for kmer_info in important_kmers:
+        change = kmer_info['impact']  # positive => pushes up, negative => pushes down
         current_prob += change
+        steps.append((kmer_info['kmer'], current_prob, change))
+    x = range(len(steps))
+    y = [step[1] for step in steps]
+    # Plot steps
+    ax1.step(x, y, 'b-', where='post', label='Probability', linewidth=2)
+    ax1.plot(x, y, 'b.', markersize=10)
+    # Add reference line
+    ax1.axhline(y=0.5, color='r', linestyle='--', label='Neutral (0.5)')
+    # Customize plot
+    ax1.grid(True, linestyle='--', alpha=0.7)
+    ax1.set_ylim(0, 1)
+    ax1.set_ylabel('Human Probability')
+    ax1.set_title(f'K-mer Contributions to Prediction (final prob: {human_prob:.3f})')
+    # Add labels for each point
     for i, (kmer, prob, change) in enumerate(steps):
+        # Add k-mer label (offset alternates above/below so neighbours do not overlap)
+        ax1.annotate(kmer,
+                     (i, prob),
+                     xytext=(0, 10 if i % 2 == 0 else -20),
+                     textcoords='offset points',
+                     ha='center',
+                     rotation=45)
+        # Add change value (skipped for the synthetic 'Start' entry at i == 0)
         if i > 0:
             change_text = f'{change:+.3f}'
             color = 'green' if change > 0 else 'red'
+            ax1.annotate(change_text,
+                         (i, prob),
+                         xytext=(0, -20 if i % 2 == 0 else 10),
+                         textcoords='offset points',
+                         ha='center',
+                         color=color)
+    ax1.legend()
+    # 2. K-mer Frequency and Sigma Plot
+    ax2 = plt.subplot(gs[1])
     # Prepare data
     kmers = [k['kmer'] for k in important_kmers]
     frequencies = [k['occurrence'] for k in important_kmers]
     sigmas = [k['sigma'] for k in important_kmers]
+    # Color the bars: if impact>0 => green, else red
+    colors = ['g' if k['impact'] > 0 else 'r' for k in important_kmers]
+    # Create bar plot for frequencies
     x = np.arange(len(kmers))
     width = 0.35
+    ax2.bar(x - width/2, frequencies, width, label='Frequency (%)', color=colors, alpha=0.6)
+    # Twin axis for sigma
+    ax2_twin = ax2.twinx()
+    # To highlight positive or negative sigma, pick color accordingly
+    # (the impact colour `c` from the zip is not used for the sigma bars)
+    sigma_colors = []
+    for s, c in zip(sigmas, colors):
+        if s >= 0:
+            sigma_colors.append('blue')  # above average
+        else:
+            sigma_colors.append('gray')  # below average
+    ax2_twin.bar(x + width/2, sigmas, width, label='σ from Mean', color=sigma_colors, alpha=0.3)
+    # Customize plot
+    ax2.set_xticks(x)
+    ax2.set_xticklabels(kmers, rotation=45)
+    ax2.set_ylabel('Frequency (%)')
+    ax2_twin.set_ylabel('Standard Deviations (σ) from Mean')
+    ax2.set_title('K-mer Frequencies and Statistical Significance')
+    # Add legends (merge handles from both y-axes into a single legend box)
+    lines1, labels1 = ax2.get_legend_handles_labels()
+    lines2, labels2 = ax2_twin.get_legend_handles_labels()
+    ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')
     plt.tight_layout()
     return fig
+def create_shap_like_bar_plot(impact_values, kmer_list, top_k):
     """
+    Creates a horizontal bar plot showing the top_k features by absolute impact.
+
+    Args:
+        impact_values: array-like of signed per-feature impacts.
+        kmer_list: list of k-mer labels, index-aligned with impact_values.
+        top_k: how many top features to display; coerced to int so a float
+            (for example a UI slider value) does not break the slice.
+
+    Returns:
+        The matplotlib figure (not closed here). Bars are green for positive
+        impact and red otherwise, with the largest |impact| at the top.
     """
+    # np.asarray lets plain lists work with the fancy indexing below
+    impact_values = np.asarray(impact_values)
+    top_k = int(top_k)
+    # Sort by absolute impact, largest first, and keep the first top_k
+    top_indices = np.argsort(np.abs(impact_values))[::-1][:top_k]
+    top_impacts = impact_values[top_indices]
+    top_kmers = [kmer_list[i] for i in top_indices]
+    bar_colors = ['green' if value > 0 else 'red' for value in top_impacts]
+    positions = list(range(len(top_impacts)))
+    # Explicit figure/axes objects instead of pyplot's implicit "current" figure
+    fig, ax = plt.subplots(figsize=(8, 6))
+    ax.barh(positions, top_impacts, color=bar_colors)
+    ax.set_yticks(positions)
+    ax.set_yticklabels(top_kmers)
+    ax.set_xlabel("Impact on Human Probability (Ablation)")
+    ax.set_title(f"Top {top_k} K-mers by Absolute Impact")
+    ax.invert_yaxis()  # Highest at top
+    fig.tight_layout()
+    return fig
+def create_global_bar_plot(impact_values, kmer_list):
+    """
+    Creates a bar plot of ALL feature impacts to show the global distribution.
+
+    Args:
+        impact_values: array-like of signed per-feature impacts.
+        kmer_list: list of k-mer labels. Kept for interface compatibility;
+            the x-axis is intentionally left unlabelled, so it is not used.
+
+    Returns:
+        The matplotlib figure (not closed here). Bars are ordered by
+        descending |impact|; green for positive impact, red otherwise.
+    """
+    # np.asarray lets plain lists work with the fancy indexing below
+    impact_values = np.asarray(impact_values)
+    order = np.argsort(np.abs(impact_values))[::-1]
+    sorted_impacts = impact_values[order]
+    bar_colors = ['green' if value > 0 else 'red' for value in sorted_impacts]
+    fig, ax = plt.subplots(figsize=(12, 6))
+    ax.bar(range(len(sorted_impacts)), sorted_impacts, color=bar_colors)
+    # Report the real feature count rather than assuming k=4 (256 features)
+    ax.set_title(f"Global Impact of All {len(sorted_impacts)} K-mers (Ablation Method)")
+    ax.set_xlabel("K-mer (sorted by |impact|)")
+    ax.set_ylabel("Impact on Human Probability")
+    fig.tight_layout()
+    return fig
+##############################################################################
+# MAIN PREDICTION FUNCTION
+##############################################################################
+def predict(file_obj, top_kmers=10, advanced_plots=False, fasta_text=""):
+    """
+    Main prediction function called by Gradio.
+
+    Only the first record of the FASTA input is classified; any further
+    records are ignored.
+
+    Args:
+        file_obj: optional uploaded FASTA content, either raw bytes or an
+            already-decoded string.
+        top_kmers: number of top k-mers (ranked by absolute ablation impact)
+            listed in the text report and shown in the plots.
+        advanced_plots: bool, whether to also return the global bar plot.
+        fasta_text: optional direct-pasted FASTA text; takes priority over
+            file_obj when it is not blank.
+
+    Returns:
+        Tuple (results_text, shap_img, step_img, global_img). The images are
+        PIL images; global_img is None unless advanced_plots is set. On an
+        input or model-loading error the tuple is (message, None, None, None).
+    """
+    # Defensive normalisation of UI-supplied values: a None textbox value
+    # would crash on .strip(), and a float slider value would crash when used
+    # as a slice bound further down.
+    fasta_text = (fasta_text or "").strip()
+    top_kmers = int(top_kmers)
+    # Priority: If user pasted text, use that; otherwise use uploaded file.
+    if fasta_text:
+        text = fasta_text
+    elif file_obj is None:
+        return "No FASTA input provided", None, None, None
+    else:
+        try:
+            text = file_obj if isinstance(file_obj, str) else file_obj.decode('utf-8')
+        except Exception as e:
+            return f"Error reading file: {str(e)}", None, None, None
+    # Parse FASTA
+    sequences = parse_fasta(text)
+    if not sequences:
+        return "No valid FASTA sequences found", None, None, None
+    header, seq = sequences[0]
+    # Load model + scaler
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    model = VirusClassifier(256).to(device)
     try:
         state_dict = torch.load('model.pt', map_location=device)
         model.load_state_dict(state_dict)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
+        return f"Error loading model or scaler: {str(e)}", None, None, None
+    # Make inference mode explicit here instead of relying on a helper to set it
+    model.eval()
+    # Prepare the vector
+    raw_freq_vector = sequence_to_kmer_vector(seq, k=4)
+    scaled_vector = scaler.transform(raw_freq_vector.reshape(1, -1))
+    X_tensor = torch.FloatTensor(scaled_vector).to(device)
+    # Compute ablation-based importances (the baseline probability is
+    # recomputed below together with the non-human probability)
+    importances, _ = ablation_importance(model, X_tensor)
+    kmers_4 = [''.join(p) for p in product("ACGT", repeat=4)]
+    # Indices of the top_kmers features with the largest |impact|, largest first
+    top_indices = np.argsort(np.abs(importances))[::-1][:top_kmers]
+    # Per-k-mer info. If the scaler is e.g. a StandardScaler, then
+    # scaled_vector[0][i] is how many std devs from the mean that feature is.
+    important_kmers = [
+        {
+            'kmer': kmers_4[idx],
+            # how much ablating this feature changed the probability
+            'impact': float(importances[idx]),
+            # raw frequency expressed in %
+            'occurrence': float(raw_freq_vector[idx] * 100.0),
+            'sigma': float(scaled_vector[0][idx]),
+        }
+        for idx in top_indices
+    ]
+    # Final class is decided from the model's direct output
+    with torch.no_grad():
+        output = model(X_tensor)
+        probs = torch.softmax(output, dim=1)
+    human_prob = probs[0, 1].item()
+    nonhuman_prob = probs[0, 0].item()
+    pred_label = 'human' if human_prob > nonhuman_prob else 'non-human'
+    confidence = max(human_prob, nonhuman_prob)
+    report_parts = [
+        f"Sequence: {header}\n"
+        f"Prediction: {pred_label}\n"
+        f"Confidence: {confidence:.4f}\n"
+        f"Human probability: {human_prob:.4f}\n"
+        f"Non-human probability: {nonhuman_prob:.4f}\n"
+        f"Most influential k-mers (by ablation impact):\n"
+    ]
+    for kmer_info in important_kmers:
+        # impact>0 => removing it lowers p(human), so it was pushing p(human) up
+        direction = "UP (toward human)" if kmer_info['impact'] > 0 else "DOWN (toward non-human)"
+        side = "above" if kmer_info['sigma'] > 0 else "below"
+        report_parts.append(
+            f"  {kmer_info['kmer']}: {direction}, "
+            f"Impact={kmer_info['impact']:.4f}, "
+            f"Occ={kmer_info['occurrence']:.2f}% of seq, "
+            f"{abs(kmer_info['sigma']):.2f}σ "
+            f"{side} mean\n"
+        )
+    results_text = "".join(report_parts)
+    # Convert figures to PIL Images
+    def fig_to_image(fig):
+        buf = io.BytesIO()
+        fig.savefig(buf, format='png', bbox_inches='tight', dpi=200)
+        buf.seek(0)
+        im = Image.open(buf)
+        plt.close(fig)  # release the figure so repeated requests do not accumulate
+        return im
+    # PLOT 1: A SHAP-like bar plot for the top K features
+    shap_img = fig_to_image(create_shap_like_bar_plot(importances, kmers_4, top_kmers))
+    # PLOT 2: Step + frequency plot for the top K features
+    step_img = fig_to_image(create_step_and_frequency_plot(important_kmers, human_prob, header))
+    # PLOT 3 (optional advanced): global bar plot of all 256 features
+    global_img = None
+    if advanced_plots:
+        global_img = fig_to_image(create_global_bar_plot(importances, kmers_4))
+    return results_text, shap_img, step_img, global_img
+##############################################################################
+# GRADIO INTERFACE
+##############################################################################
+title_text = "Virus Host Classifier"
+description_text = """
+Upload or paste a FASTA sequence to predict if it's likely **human** or **non-human** origin.
+- **k=4** k-mers are used as features.
+- We display ablation-based feature importance for interpretability.
+- Advanced plots can be toggled to see the global distribution of all 256 k-mer impacts.
+"""
+# The components take no `optional=` keyword: current Gradio releases reject it
+# with a TypeError at construction time. An input left empty reaches predict()
+# as None, and an output returned as None is simply rendered empty.
+iface = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.File(label="Upload FASTA file", type="binary"),
+        gr.Slider(label="Number of top k-mers to show", minimum=1, maximum=50, value=10, step=1),
+        gr.Checkbox(label="Show advanced (global) plots?", value=False),
+        gr.Textbox(label="Or paste FASTA text here", lines=5, placeholder=">header\nACGTACGT...")
+    ],
+    outputs=[
+        gr.Textbox(label="Results", lines=10),
+        gr.Image(label="SHAP-like Top-k K-mer Bar Plot"),
+        gr.Image(label="Step & Frequency Plot (Top-k)"),
+        gr.Image(label="Global 256-K-mer Plot (advanced)")
+    ],
+    title=title_text,
+    description=description_text
+)
 if __name__ == "__main__":
+    iface.launch(share=True)