Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12, 2025

Commit

f5ea8d6

verified ·

1 Parent(s): 455bf4d

Update app.py

Browse files

Files changed (1) hide show

app.py +176 -273

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gradio as gr
 import torch
 import numpy as np
 from itertools import product
 import torch.nn as nn
@@ -7,7 +8,6 @@ import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
 import io
 from PIL import Image
-from scipy.interpolate import interp1d
 ###############################################################################
 # 1. MODEL DEFINITION
@@ -71,7 +71,7 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     total_kmers = len(sequence) - k + 1
     if total_kmers > 0:
-        vec /= total_kmers
     return vec
@@ -86,10 +86,12 @@ def calculate_shap_values(model, x_tensor):
     """
     model.eval()
     with torch.no_grad():
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
-        baseline_prob = baseline_probs[0, 1].item()  # Probability of 'human'
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
@@ -100,7 +102,7 @@ def calculate_shap_values(model, x_tensor):
             prob = probs[0, 1].item()
             impact = baseline_prob - prob
             shap_values.append(impact)
-            x_zeroed[0, i] = original_val
     return np.array(shap_values), baseline_prob
 ###############################################################################
@@ -108,6 +110,10 @@ def calculate_shap_values(model, x_tensor):
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
@@ -132,13 +138,20 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
 ###############################################################################
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
     n = len(shap_means)
     if n == 0:
         return (0, 0, 0.0)
     if window_size >= n:
         avg_val = float(np.mean(shap_means))
         return (0, n, avg_val)
     csum = np.zeros(n + 1, dtype=np.float32)
     csum[1:] = np.cumsum(shap_means)
@@ -165,6 +178,7 @@ def find_extreme_subregion(shap_means, window_size=500, mode="max"):
 ###############################################################################
 def fig_to_image(fig):
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
@@ -173,14 +187,27 @@ def fig_to_image(fig):
     return img
 def get_zero_centered_cmap():
     colors = [
-        (0.0, 'blue'),
-        (0.5, 'white'),
-        (1.0, 'red')
     ]
-    return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
@@ -191,46 +218,73 @@ def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, e
     if len(local_shap) == 0:
         local_shap = np.array([0.0])
     heatmap_data = local_shap.reshape(1, -1)
     min_val = np.min(local_shap)
     max_val = np.max(local_shap)
     extent = max(abs(min_val), abs(max_val))
-    cmap = get_zero_centered_cmap()
-    fig, ax = plt.subplots(figsize=(12, 1.8))
     cax = ax.imshow(
         heatmap_data,
         aspect='auto',
-        cmap=cmap,
         vmin=-extent,
-        vmax=extent
     )
     cbar = plt.colorbar(
         cax,
         orientation='horizontal',
-        pad=0.25,
-        aspect=40,
-        shrink=0.8
     )
-    cbar.ax.tick_params(labelsize=8)
-    cbar.set_label('SHAP Contribution', fontsize=9, labelpad=5)
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence', fontsize=10)
     ax.set_title(f"{title}{subtitle}", pad=10)
-    plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
     return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     plt.rcParams.update({'font.size': 10})
     fig = plt.figure(figsize=(10, 5))
     indices = np.argsort(np.abs(shap_values))[-top_k:]
     values = shap_values[indices]
     features = [kmers[i] for i in indices]
     colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
     plt.xlabel('SHAP Value (impact on model output)')
@@ -240,6 +294,9 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     return fig
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
@@ -251,102 +308,119 @@ def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
     return fig
 def compute_gc_content(sequence):
     """
     Compute the GC percentage of a nucleotide sequence.
     Bases are matched case-insensitively so that lowercase (soft-masked)
     input is not under-reported.
     Args:
         sequence: String of nucleotide characters.
     Returns:
         Percentage (0.0-100.0) of characters in `sequence` that are G or C,
         measured against the full length of `sequence`; 0.0 when it is empty.
     """
     if not sequence:
         return 0.0
     # Count both cases explicitly instead of copying the whole sequence
     # with .upper(); genomes can be large.
     gc_count = sum(sequence.count(base) for base in "GCgc")
     return (gc_count / len(sequence)) * 100.0
 ###############################################################################
-# 7. SEQUENCE ANALYSIS FUNCTIONS
 ###############################################################################
-# Set up device and load the model once globally
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = VirusClassifier(256)
-model.load_state_dict(torch.load("model.pt", map_location=device))
-model.to(device)
-model.eval()
-KMERS_4 = [''.join(p) for p in product("ACGT", repeat=4)]
-def analyze_sequence(file_path, top_k=10, fasta_text="", window_size=500):
     """
-    Analyze a virus sequence from a FASTA file or text input.
-    Returns (results_text, kmer_plot, heatmap_plot, state_dict, header)
     """
     try:
-        if file_path:
-            with open(file_path, 'r') as f:
-                fasta_text = f.read()
-        if not fasta_text.strip():
-            return ("Error: No sequence provided", None, None, {}, "")
-        sequences = parse_fasta(fasta_text)
-        if not sequences:
-            return ("Error: No valid FASTA sequences found", None, None, {}, "")
-        header, sequence = sequences[0]
-        x = sequence_to_kmer_vector(sequence, k=4)
-        x_tensor = torch.tensor(x).float().unsqueeze(0).to(device)
-        with torch.no_grad():
-            output = model(x_tensor)
-            probs = torch.softmax(output, dim=1)
-            pred_human = probs[0, 1].item()
-        classification = "Human" if pred_human > 0.5 else "Non-human"
-        shap_values, baseline_prob = calculate_shap_values(model, x_tensor)
-        shap_means = compute_positionwise_scores(sequence, shap_values, k=4)
-        start_max, end_max, avg_max = find_extreme_subregion(shap_means, window_size, mode="max")
-        start_min, end_min, avg_min = find_extreme_subregion(shap_means, window_size, mode="min")
-        results = (
-            f"Classification: {classification} "
-            f"(probability of human = {pred_human:.3f})\n\n"
-            f"Sequence length: {len(sequence):,} bases\n"
-            f"Overall GC content: {compute_gc_content(sequence):.1f}%\n\n"
-            f"Most human-like {window_size} bp region:\n"
-            f"Position {start_max:,} to {end_max:,}\n"
-            f"Average SHAP: {avg_max:.4f}\n"
-            f"GC content: {compute_gc_content(sequence[start_max:end_max]):.1f}%\n\n"
-            f"Least human-like {window_size} bp region:\n"
-            f"Position {start_min:,} to {end_min:,}\n"
-            f"Average SHAP: {avg_min:.4f}\n"
-            f"GC content: {compute_gc_content(sequence[start_min:end_min]):.1f}%"
-        )
-        kmer_fig = create_importance_bar_plot(shap_values, KMERS_4, top_k=top_k)
-        kmer_img = fig_to_image(kmer_fig)
-        heatmap_fig = plot_linear_heatmap(shap_means)
-        heatmap_img = fig_to_image(heatmap_fig)
-        state = {
-            "seq": sequence,
-            "shap_means": shap_means
-        }
-        return results, kmer_img, heatmap_img, state, header
     except Exception as e:
-        return (f"Error analyzing sequence: {str(e)}", None, None, {}, "")
 ###############################################################################
-# 8. SUBREGION ANALYSIS FUNCTION
 ###############################################################################
 def analyze_subregion(state, header, region_start, region_end):
     if not state or "seq" not in state or "shap_means" not in state:
         return ("No sequence data found. Please run Step 1 first.", None, None)
     seq = state["seq"]
     shap_means = state["shap_means"]
     region_start = int(region_start)
     region_end = int(region_end)
@@ -355,15 +429,19 @@ def analyze_subregion(state, header, region_start, region_end):
     if region_end <= region_start:
         return ("Invalid region range. End must be > Start.", None, None)
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
     positive_fraction = np.mean(region_shap > 0)
     negative_fraction = np.mean(region_shap < 0)
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
@@ -381,6 +459,7 @@ def analyze_subregion(state, header, region_start, region_end):
         f"Subregion interpretation: {region_classification}\n"
     )
     heatmap_fig = plot_linear_heatmap(
         shap_means,
         title="Subregion SHAP",
@@ -389,122 +468,15 @@ def analyze_subregion(state, header, region_start, region_end):
     )
     heatmap_img = fig_to_image(heatmap_fig)
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
     return (region_info, heatmap_img, hist_img)
-###############################################################################
-# 9. COMPARISON ANALYSIS FUNCTIONS
-###############################################################################
-def normalize_shap_lengths(shap1, shap2, num_points=1000):
-    x1 = np.linspace(0, 1, len(shap1))
-    x2 = np.linspace(0, 1, len(shap2))
-    f1 = interp1d(x1, shap1, kind='linear')
-    f2 = interp1d(x2, shap2, kind='linear')
-    x_new = np.linspace(0, 1, num_points)
-    shap1_norm = f1(x_new)
-    shap2_norm = f2(x_new)
-    return shap1_norm, shap2_norm
-def compute_shap_difference(shap1_norm, shap2_norm):
-    return shap2_norm - shap1_norm
-def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
-    heatmap_data = shap_diff.reshape(1, -1)
-    extent = max(abs(np.min(shap_diff)), abs(np.max(shap_diff)))
-    cmap = get_zero_centered_cmap()
-    fig, ax = plt.subplots(figsize=(12, 1.8))
-    cax = ax.imshow(
-        heatmap_data,
-        aspect='auto',
-        cmap=cmap,
-        vmin=-extent,
-        vmax=extent
-    )
-    cbar = plt.colorbar(
-        cax,
-        orientation='horizontal',
-        pad=0.25,
-        aspect=40,
-        shrink=0.8
-    )
-    cbar.ax.tick_params(labelsize=8)
-    cbar.set_label('SHAP Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)
-    ax.set_yticks([])
-    ax.set_xlabel('Normalized Position (0-100%)', fontsize=10)
-    ax.set_title(title, pad=10)
-    plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
-    return fig
-def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
-    results1 = analyze_sequence(file1, top_k=10, fasta_text=fasta1, window_size=500)
-    if isinstance(results1[0], str) and "Error" in results1[0]:
-        return (f"Error in sequence 1: {results1[0]}", None, None)
-    results2 = analyze_sequence(file2, top_k=10, fasta_text=fasta2, window_size=500)
-    if isinstance(results2[0], str) and "Error" in results2[0]:
-        return (f"Error in sequence 2: {results2[0]}", None, None)
-    shap1 = results1[3]["shap_means"]
-    shap2 = results2[3]["shap_means"]
-    shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
-    shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
-    avg_diff = np.mean(shap_diff)
-    std_diff = np.std(shap_diff)
-    max_diff = np.max(shap_diff)
-    min_diff = np.min(shap_diff)
-    threshold = 0.05
-    substantial_diffs = np.abs(shap_diff) > threshold
-    frac_different = np.mean(substantial_diffs)
-    classification1 = results1[0].split('Classification: ')[1].split('\n')[0].strip()
-    classification2 = results2[0].split('Classification: ')[1].split('\n')[0].strip()
-    len1_formatted = "{:,}".format(len(shap1))
-    len2_formatted = "{:,}".format(len(shap2))
-    frac_formatted = "{:.2%}".format(frac_different)
-    comparison_text = (
-        "Sequence Comparison Results:\n"
-        f"Sequence 1: {results1[4]}\n"
-        f"Length: {len1_formatted} bases\n"
-        f"Classification: {classification1}\n\n"
-        f"Sequence 2: {results2[4]}\n"
-        f"Length: {len2_formatted} bases\n"
-        f"Classification: {classification2}\n\n"
-        "Comparison Statistics:\n"
-        f"Average SHAP difference: {avg_diff:.4f}\n"
-        f"Standard deviation: {std_diff:.4f}\n"
-        f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
-        f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
-        f"Fraction of positions with substantial differences: {frac_formatted}\n\n"
-        "Interpretation:\n"
-        "Positive values (red) indicate regions where Sequence 2 is more 'human-like'\n"
-        "Negative values (blue) indicate regions where Sequence 1 is more 'human-like'"
-    )
-    heatmap_fig = plot_comparative_heatmap(shap_diff)
-    heatmap_img = fig_to_image(heatmap_fig)
-    hist_fig = plot_shap_histogram(shap_diff, title="Distribution of SHAP Differences")
-    hist_img = fig_to_image(hist_fig)
-    return comparison_text, heatmap_img, hist_img
 ###############################################################################
-# 10. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
@@ -535,14 +507,14 @@ with gr.Blocks(css=css) as iface:
                     placeholder=">sequence_name\nACGTACGT...",
                     lines=5
                 )
-                top_k_slider = gr.Slider(
                     minimum=5,
                     maximum=30,
                     value=10,
                     step=1,
                     label="Number of top k-mers to display"
                 )
-                win_size_slider = gr.Slider(
                     minimum=100,
                     maximum=5000,
                     value=500,
@@ -561,9 +533,10 @@ with gr.Blocks(css=css) as iface:
         seq_state = gr.State()
         header_state = gr.State()
         analyze_btn.click(
             analyze_sequence,
-            inputs=[file_input, top_k_slider, text_input, win_size_slider],
             outputs=[results_box, kmer_img, genome_img, seq_state, header_state]
         )
@@ -592,61 +565,6 @@ with gr.Blocks(css=css) as iface:
             inputs=[seq_state, header_state, region_start, region_end],
             outputs=[subregion_info, subregion_img, subregion_hist_img]
         )
-    with gr.Tab("3) Comparative Analysis"):
-        gr.Markdown("""
-        **Compare Two Sequences**
-        Upload or paste two FASTA sequences to compare their SHAP patterns.
-        The sequences will be normalized to the same length for comparison.
-        **Color Scale**:
-        - Red: Sequence 2 is more human-like in this region
-        - Blue: Sequence 1 is more human-like in this region
-        - White: No substantial difference
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                file_input1 = gr.File(
-                    label="Upload first FASTA file",
-                    file_types=[".fasta", ".fa", ".txt"],
-                    type="filepath"
-                )
-                text_input1 = gr.Textbox(
-                    label="Or paste first FASTA sequence",
-                    placeholder=">sequence1\nACGTACGT...",
-                    lines=5
-                )
-            with gr.Column(scale=1):
-                file_input2 = gr.File(
-                    label="Upload second FASTA file",
-                    file_types=[".fasta", ".fa", ".txt"],
-                    type="filepath"
-                )
-                text_input2 = gr.Textbox(
-                    label="Or paste second FASTA sequence",
-                    placeholder=">sequence2\nACGTACGT...",
-                    lines=5
-                )
-        compare_btn = gr.Button("Compare Sequences", variant="primary")
-        comparison_text = gr.Textbox(
-            label="Comparison Results",
-            lines=12,
-            interactive=False
-        )
-        with gr.Row():
-            diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
-            diff_hist = gr.Image(label="Distribution of SHAP Differences")
-        compare_btn.click(
-            analyze_sequence_comparison,
-            inputs=[file_input1, file_input2, text_input1, text_input2],
-            outputs=[comparison_text, diff_heatmap, diff_hist]
-        )
     gr.Markdown("""
     ### Interface Features
@@ -660,22 +578,7 @@ with gr.Blocks(css=css) as iface:
       - GC content
       - Fraction of positions pushing human vs. non-human
       - Simple logic-based classification
-    - **Sequence Comparison**:
-      - Compare two sequences to identify regions of difference
-      - Normalized comparison to handle different sequence lengths
-      - Statistical summary of differences
     """)
 if __name__ == "__main__":
-    plt.style.use('default')
-    plt.rcParams['figure.figsize'] = [10, 6]
-    plt.rcParams['figure.dpi'] = 100
-    plt.rcParams['font.size'] = 10
-    iface.launch(
-        share=False,
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_api=False,
-        debug=False
-    )

 import gradio as gr
 import torch
+import joblib
 import numpy as np
 from itertools import product
 import torch.nn as nn
 import matplotlib.colors as mcolors
 import io
 from PIL import Image
 ###############################################################################
 # 1. MODEL DEFINITION
     total_kmers = len(sequence) - k + 1
     if total_kmers > 0:
+        vec = vec / total_kmers
     return vec
     """
     model.eval()
     with torch.no_grad():
+        # Baseline
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
+        baseline_prob = baseline_probs[0, 1].item()  # Probability of 'human' class
+        # Zeroing each feature to measure impact
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
             prob = probs[0, 1].item()
             impact = baseline_prob - prob
             shap_values.append(impact)
+            x_zeroed[0, i] = original_val  # restore
     return np.array(shap_values), baseline_prob
 ###############################################################################
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
+    """
+    Returns an array of per-base SHAP contributions by averaging
+    the k-mer SHAP values of all k-mers covering that base.
+    """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
 ###############################################################################
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
+    """
+    Finds the subregion of length `window_size` that has the maximum
+    (mode="max") or minimum (mode="min") average SHAP.
+    Returns (best_start, best_end, best_avg).
+    """
     n = len(shap_means)
     if n == 0:
         return (0, 0, 0.0)
     if window_size >= n:
+        # entire sequence
         avg_val = float(np.mean(shap_means))
         return (0, n, avg_val)
+    # We'll build csum of length n+1
     csum = np.zeros(n + 1, dtype=np.float32)
     csum[1:] = np.cumsum(shap_means)
 ###############################################################################
 def fig_to_image(fig):
+    """Convert a Matplotlib figure to a PIL Image for Gradio."""
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
     return img
 def get_zero_centered_cmap():
     """
     Build the diverging "blue_white_red" colormap used for SHAP heatmaps.
     The anchors place blue at 0.0 (negative end), white at 0.5 (zero when the
     data range is symmetric) and red at 1.0 (positive end).
     """
     return mcolors.LinearSegmentedColormap.from_list(
         "blue_white_red",
         [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')],
     )
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
+    """
+    Plots a 1D heatmap of per-base SHAP contributions with a custom colormap:
+    - Negative = blue
+    - 0 = white
+    - Positive = red
+    """
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
     if len(local_shap) == 0:
         local_shap = np.array([0.0])
+    # Build 2D array for imshow
     heatmap_data = local_shap.reshape(1, -1)
+    # Force symmetrical range
     min_val = np.min(local_shap)
     max_val = np.max(local_shap)
     extent = max(abs(min_val), abs(max_val))
+    # Create custom colormap
+    custom_cmap = get_zero_centered_cmap()
+    # Create figure with adjusted height ratio
+    fig, ax = plt.subplots(figsize=(12, 1.8))  # Reduced height
+    # Plot heatmap
     cax = ax.imshow(
         heatmap_data,
         aspect='auto',
+        cmap=custom_cmap,
         vmin=-extent,
+        vmax=+extent
     )
+    # Configure colorbar with more subtle positioning
     cbar = plt.colorbar(
         cax,
         orientation='horizontal',
+        pad=0.25,  # Reduced padding
+        aspect=40,  # Make colorbar thinner
+        shrink=0.8  # Make colorbar shorter than plot width
     )
+    # Style the colorbar
+    cbar.ax.tick_params(labelsize=8)  # Smaller tick labels
+    cbar.set_label(
+        'SHAP Contribution',
+        fontsize=9,
+        labelpad=5
+    )
+    # Configure main plot
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence', fontsize=10)
     ax.set_title(f"{title}{subtitle}", pad=10)
+    # Fine-tune layout
+    plt.subplots_adjust(
+        bottom=0.25,  # Reduced bottom margin
+        left=0.05,    # Tighter left margin
+        right=0.95    # Tighter right margin
+    )
     return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
+    """Create a bar plot of the most important k-mers."""
     plt.rcParams.update({'font.size': 10})
     fig = plt.figure(figsize=(10, 5))
+    # Sort by absolute importance
     indices = np.argsort(np.abs(shap_values))[-top_k:]
     values = shap_values[indices]
     features = [kmers[i] for i in indices]
+    # negative -> blue, positive -> red
     colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
     plt.xlabel('SHAP Value (impact on model output)')
     return fig
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
+    """
+    Simple histogram of SHAP values in the subregion.
+    """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
     return fig
 def compute_gc_content(sequence):
     """
     Compute %GC in the sequence (A, C, G, T).
     Bases are matched case-insensitively so that lowercase (soft-masked)
     input is not under-reported.
     Args:
         sequence: String of nucleotide characters.
     Returns:
         Percentage (0.0-100.0) of characters in `sequence` that are G or C,
         measured against the full length of `sequence`; 0.0 when it is empty.
     """
     if not sequence:
         return 0.0
     # Count both cases explicitly instead of copying the whole sequence
     # with .upper(); genomes can be large.
     gc_count = sum(sequence.count(base) for base in "GCgc")
     return (gc_count / len(sequence)) * 100.0
 ###############################################################################
+# 7. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
+def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     """
+    Analyzes the entire genome, returning classification, full-genome heatmap,
+    top k-mer bar plot, and identifies subregions with strongest positive/negative push.
     """
+    # Handle input
+    if fasta_text.strip():
+        text = fasta_text.strip()
+    elif file_obj is not None:
+        try:
+            with open(file_obj, 'r') as f:
+                text = f.read()
+        except Exception as e:
+            return (f"Error reading file: {str(e)}", None, None, None, None)
+    else:
+        return ("Please provide a FASTA sequence.", None, None, None, None)
+    # Parse FASTA
+    sequences = parse_fasta(text)
+    if not sequences:
+        return ("No valid FASTA sequences found.", None, None, None, None)
+    header, seq = sequences[0]
+    # Load model and scaler
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
+        # Use weights_only=True for safer loading
+        state_dict = torch.load('model.pt', map_location=device, weights_only=True)
+        model = VirusClassifier(256).to(device)
+        model.load_state_dict(state_dict)
+        scaler = joblib.load('scaler.pkl')
     except Exception as e:
+        return (f"Error loading model/scaler: {str(e)}", None, None, None, None)
+    # Vectorize + scale
+    freq_vector = sequence_to_kmer_vector(seq)
+    scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
+    x_tensor = torch.FloatTensor(scaled_vector).to(device)
+    # SHAP + classification
+    shap_values, prob_human = calculate_shap_values(model, x_tensor)
+    prob_nonhuman = 1.0 - prob_human
+    classification = "Human" if prob_human > 0.5 else "Non-human"
+    confidence = max(prob_human, prob_nonhuman)
+    # Per-base SHAP
+    shap_means = compute_positionwise_scores(seq, shap_values, k=4)
+    # Find the most "human-pushing" region
+    (max_start, max_end, max_avg) = find_extreme_subregion(shap_means, window_size, mode="max")
+    # Find the most "non-human–pushing" region
+    (min_start, min_end, min_avg) = find_extreme_subregion(shap_means, window_size, mode="min")
+    # Build results text
+    results_text = (
+        f"Sequence: {header}\n"
+        f"Length: {len(seq):,} bases\n"
+        f"Classification: {classification}\n"
+        f"Confidence: {confidence:.3f}\n"
+        f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})\n\n"
+        f"---\n"
+        f"**Most Human-Pushing {window_size}-bp Subregion**:\n"
+        f"Start: {max_start}, End: {max_end}, Avg SHAP: {max_avg:.4f}\n\n"
+        f"**Most Non-Human–Pushing {window_size}-bp Subregion**:\n"
+        f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
+    )
+    # K-mer importance plot
+    kmers = [''.join(p) for p in product("ACGT", repeat=4)]
+    bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
+    bar_img = fig_to_image(bar_fig)
+    # Full-genome SHAP heatmap
+    heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
+    heatmap_img = fig_to_image(heatmap_fig)
+    # Store data for subregion analysis
+    state_dict_out = {
+        "seq": seq,
+        "shap_means": shap_means
+    }
+    return (results_text, bar_img, heatmap_img, state_dict_out, header)
 ###############################################################################
+# 8. SUBREGION ANALYSIS (Gradio Step 2)
 ###############################################################################
 def analyze_subregion(state, header, region_start, region_end):
+    """
+    Takes stored data from step 1 and a user-chosen region.
+    Returns a subregion heatmap, histogram, and some stats (GC, average SHAP).
+    """
     if not state or "seq" not in state or "shap_means" not in state:
         return ("No sequence data found. Please run Step 1 first.", None, None)
     seq = state["seq"]
     shap_means = state["shap_means"]
+    # Validate bounds
     region_start = int(region_start)
     region_end = int(region_end)
     if region_end <= region_start:
         return ("Invalid region range. End must be > Start.", None, None)
+    # Subsequence
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
+    # Some stats
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
+    # Fraction pushing toward human vs. non-human
     positive_fraction = np.mean(region_shap > 0)
     negative_fraction = np.mean(region_shap < 0)
+    # Simple logic-based interpretation
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
         f"Subregion interpretation: {region_classification}\n"
     )
+    # Plot region as small heatmap
     heatmap_fig = plot_linear_heatmap(
         shap_means,
         title="Subregion SHAP",
     )
     heatmap_img = fig_to_image(heatmap_fig)
+    # Plot histogram of SHAP in region
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
     return (region_info, heatmap_img, hist_img)
 ###############################################################################
+# 9. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
                     placeholder=">sequence_name\nACGTACGT...",
                     lines=5
                 )
+                top_k = gr.Slider(
                     minimum=5,
                     maximum=30,
                     value=10,
                     step=1,
                     label="Number of top k-mers to display"
                 )
+                win_size = gr.Slider(
                     minimum=100,
                     maximum=5000,
                     value=500,
         seq_state = gr.State()
         header_state = gr.State()
+        # analyze_sequence(...) returns 5 items
         analyze_btn.click(
             analyze_sequence,
+            inputs=[file_input, top_k, text_input, win_size],
             outputs=[results_box, kmer_img, genome_img, seq_state, header_state]
         )
             inputs=[seq_state, header_state, region_start, region_end],
             outputs=[subregion_info, subregion_img, subregion_hist_img]
         )
     gr.Markdown("""
     ### Interface Features
       - GC content
       - Fraction of positions pushing human vs. non-human
       - Simple logic-based classification
     """)
 if __name__ == "__main__":
+    iface.launch()