Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12, 2025

Commit

77621ec

verified ·

1 Parent(s): f5ea8d6

Update app.py

Browse files

Files changed (1) hide show

app.py +157 -230

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
 import io
 from PIL import Image
 ###############################################################################
 # 1. MODEL DEFINITION
@@ -38,15 +39,12 @@ class VirusClassifier(nn.Module):
 ###############################################################################
 def parse_fasta(text):
-    """Parse FASTA formatted text into a list of (header, sequence)."""
     sequences = []
     current_header = None
     current_sequence = []
     for line in text.strip().split('\n'):
         line = line.strip()
-        if not line:
-            continue
         if line.startswith('>'):
             if current_header:
                 sequences.append((current_header, ''.join(current_sequence)))
@@ -59,20 +57,16 @@ def parse_fasta(text):
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
-    """Convert a sequence to a k-mer frequency vector for classification."""
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
     for i in range(len(sequence) - k + 1):
         kmer = sequence[i:i+k]
         if kmer in kmer_dict:
             vec[kmer_dict[kmer]] += 1
     total_kmers = len(sequence) - k + 1
     if total_kmers > 0:
-        vec = vec / total_kmers
     return vec
 ###############################################################################
@@ -80,18 +74,11 @@ def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
 ###############################################################################
 def calculate_shap_values(model, x_tensor):
-    """
-    Calculate SHAP values using a simple ablation approach.
-    Returns shap_values, prob_human
-    """
     model.eval()
     with torch.no_grad():
-        # Baseline
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
-        baseline_prob = baseline_probs[0, 1].item()  # Probability of 'human' class
-        # Zeroing each feature to measure impact
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
@@ -100,9 +87,8 @@ def calculate_shap_values(model, x_tensor):
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
             prob = probs[0, 1].item()
-            impact = baseline_prob - prob
-            shap_values.append(impact)
-            x_zeroed[0, i] = original_val  # restore
     return np.array(shap_values), baseline_prob
 ###############################################################################
@@ -110,27 +96,19 @@ def calculate_shap_values(model, x_tensor):
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
-    """
-    Returns an array of per-base SHAP contributions by averaging
-    the k-mer SHAP values of all k-mers covering that base.
-    """
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     seq_len = len(sequence)
     shap_sums = np.zeros(seq_len, dtype=np.float32)
     coverage = np.zeros(seq_len, dtype=np.float32)
     for i in range(seq_len - k + 1):
         kmer = sequence[i:i+k]
         if kmer in kmer_dict:
             val = shap_values[kmer_dict[kmer]]
-            shap_sums[i : i + k] += val
-            coverage[i : i + k] += 1
     with np.errstate(divide='ignore', invalid='ignore'):
         shap_means = np.where(coverage > 0, shap_sums / coverage, 0.0)
     return shap_means
 ###############################################################################
@@ -138,39 +116,22 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
 ###############################################################################
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
-    """
-    Finds the subregion of length `window_size` that has the maximum
-    (mode="max") or minimum (mode="min") average SHAP.
-    Returns (best_start, best_end, best_avg).
-    """
     n = len(shap_means)
-    if n == 0:
-        return (0, 0, 0.0)
     if window_size >= n:
-        # entire sequence
-        avg_val = float(np.mean(shap_means))
-        return (0, n, avg_val)
-    # We'll build csum of length n+1
     csum = np.zeros(n + 1, dtype=np.float32)
     csum[1:] = np.cumsum(shap_means)
     best_start = 0
     best_sum = csum[window_size] - csum[0]
     best_avg = best_sum / window_size
     for start in range(1, n - window_size + 1):
         wsum = csum[start + window_size] - csum[start]
         wavg = wsum / window_size
-        if mode == "max":
-            if wavg > best_avg:
-                best_avg = wavg
-                best_start = start
-        else:  # mode == "min"
-            if wavg < best_avg:
-                best_avg = wavg
-                best_start = start
     return (best_start, best_start + window_size, float(best_avg))
 ###############################################################################
@@ -178,7 +139,6 @@ def find_extreme_subregion(shap_means, window_size=500, mode="max"):
 ###############################################################################
 def fig_to_image(fig):
-    """Convert a Matplotlib figure to a PIL Image for Gradio."""
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
@@ -187,104 +147,41 @@ def fig_to_image(fig):
     return img
 def get_zero_centered_cmap():
-    """
-    Creates a custom diverging colormap that is:
-    - Blue for negative
-    - White for zero
-    - Red for positive
-    """
-    colors = [
-        (0.0, 'blue'),   # negative
-        (0.5, 'white'),  # zero
-        (1.0, 'red')     # positive
-    ]
-    cmap = mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
-    return cmap
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
-    """
-    Plots a 1D heatmap of per-base SHAP contributions with a custom colormap:
-    - Negative = blue
-    - 0 = white
-    - Positive = red
-    """
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
     else:
         local_shap = shap_means
         subtitle = ""
     if len(local_shap) == 0:
         local_shap = np.array([0.0])
-    # Build 2D array for imshow
     heatmap_data = local_shap.reshape(1, -1)
-    # Force symmetrical range
     min_val = np.min(local_shap)
     max_val = np.max(local_shap)
     extent = max(abs(min_val), abs(max_val))
-    # Create custom colormap
-    custom_cmap = get_zero_centered_cmap()
-    # Create figure with adjusted height ratio
-    fig, ax = plt.subplots(figsize=(12, 1.8))  # Reduced height
-    # Plot heatmap
-    cax = ax.imshow(
-        heatmap_data,
-        aspect='auto',
-        cmap=custom_cmap,
-        vmin=-extent,
-        vmax=+extent
-    )
-    # Configure colorbar with more subtle positioning
-    cbar = plt.colorbar(
-        cax,
-        orientation='horizontal',
-        pad=0.25,  # Reduced padding
-        aspect=40,  # Make colorbar thinner
-        shrink=0.8  # Make colorbar shorter than plot width
-    )
-    # Style the colorbar
-    cbar.ax.tick_params(labelsize=8)  # Smaller tick labels
-    cbar.set_label(
-        'SHAP Contribution',
-        fontsize=9,
-        labelpad=5
-    )
-    # Configure main plot
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence', fontsize=10)
     ax.set_title(f"{title}{subtitle}", pad=10)
-    # Fine-tune layout
-    plt.subplots_adjust(
-        bottom=0.25,  # Reduced bottom margin
-        left=0.05,    # Tighter left margin
-        right=0.95    # Tighter right margin
-    )
     return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
-    """Create a bar plot of the most important k-mers."""
     plt.rcParams.update({'font.size': 10})
     fig = plt.figure(figsize=(10, 5))
-    # Sort by absolute importance
     indices = np.argsort(np.abs(shap_values))[-top_k:]
     values = shap_values[indices]
     features = [kmers[i] for i in indices]
-    # negative -> blue, positive -> red
     colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
     plt.xlabel('SHAP Value (impact on model output)')
@@ -294,9 +191,6 @@ def create_importance_bar_plot(shap_values, kmers, top_k=10):
     return fig
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
-    """
-    Simple histogram of SHAP values in the subregion.
-    """
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
@@ -308,9 +202,7 @@ def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
     return fig
 def compute_gc_content(sequence):
-    """Compute %GC in the sequence (A, C, G, T)."""
-    if not sequence:
-        return 0
     gc_count = sequence.count('G') + sequence.count('C')
     return (gc_count / len(sequence)) * 100.0
@@ -319,11 +211,6 @@ def compute_gc_content(sequence):
 ###############################################################################
 def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
-    """
-    Analyzes the entire genome, returning classification, full-genome heatmap,
-    top k-mer bar plot, and identifies subregions with strongest positive/negative push.
-    """
-    # Handle input
     if fasta_text.strip():
         text = fasta_text.strip()
     elif file_obj is not None:
@@ -335,46 +222,33 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     else:
         return ("Please provide a FASTA sequence.", None, None, None, None)
-    # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
         return ("No valid FASTA sequences found.", None, None, None, None)
     header, seq = sequences[0]
-    # Load model and scaler
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
-        # Use weights_only=True for safer loading
         state_dict = torch.load('model.pt', map_location=device, weights_only=True)
         model = VirusClassifier(256).to(device)
         model.load_state_dict(state_dict)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
         return (f"Error loading model/scaler: {str(e)}", None, None, None, None)
-    # Vectorize + scale
     freq_vector = sequence_to_kmer_vector(seq)
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
     x_tensor = torch.FloatTensor(scaled_vector).to(device)
-    # SHAP + classification
     shap_values, prob_human = calculate_shap_values(model, x_tensor)
     prob_nonhuman = 1.0 - prob_human
     classification = "Human" if prob_human > 0.5 else "Non-human"
     confidence = max(prob_human, prob_nonhuman)
-    # Per-base SHAP
     shap_means = compute_positionwise_scores(seq, shap_values, k=4)
-    # Find the most "human-pushing" region
-    (max_start, max_end, max_avg) = find_extreme_subregion(shap_means, window_size, mode="max")
-    # Find the most "non-human–pushing" region
-    (min_start, min_end, min_avg) = find_extreme_subregion(shap_means, window_size, mode="min")
-    # Build results text
     results_text = (
         f"Sequence: {header}\n"
         f"Length: {len(seq):,} bases\n"
@@ -388,20 +262,14 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
         f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
     )
-    # K-mer importance plot
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
     bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
     bar_img = fig_to_image(bar_fig)
-    # Full-genome SHAP heatmap
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
-    # Store data for subregion analysis
-    state_dict_out = {
-        "seq": seq,
-        "shap_means": shap_means
-    }
     return (results_text, bar_img, heatmap_img, state_dict_out, header)
@@ -410,45 +278,28 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
 ###############################################################################
 def analyze_subregion(state, header, region_start, region_end):
-    """
-    Takes stored data from step 1 and a user-chosen region.
-    Returns a subregion heatmap, histogram, and some stats (GC, average SHAP).
-    """
     if not state or "seq" not in state or "shap_means" not in state:
         return ("No sequence data found. Please run Step 1 first.", None, None)
     seq = state["seq"]
     shap_means = state["shap_means"]
-    # Validate bounds
     region_start = int(region_start)
     region_end = int(region_end)
     region_start = max(0, min(region_start, len(seq)))
     region_end = max(0, min(region_end, len(seq)))
     if region_end <= region_start:
         return ("Invalid region range. End must be > Start.", None, None)
-    # Subsequence
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
-    # Some stats
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
-    # Fraction pushing toward human vs. non-human
     positive_fraction = np.mean(region_shap > 0)
     negative_fraction = np.mean(region_shap < 0)
-    # Simple logic-based interpretation
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
         region_classification = "Likely pushing toward non-human"
     else:
         region_classification = "Near neutral (no strong push)"
     region_info = (
         f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
         f"Region length: {len(region_seq)} bases\n"
@@ -458,25 +309,100 @@ def analyze_subregion(state, header, region_start, region_end):
         f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
         f"Subregion interpretation: {region_classification}\n"
     )
-    # Plot region as small heatmap
-    heatmap_fig = plot_linear_heatmap(
-        shap_means,
-        title="Subregion SHAP",
-        start=region_start,
-        end=region_end
-    )
     heatmap_img = fig_to_image(heatmap_fig)
-    # Plot histogram of SHAP in region
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
     return (region_info, heatmap_img, hist_img)
 ###############################################################################
-# 9. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
@@ -497,75 +423,72 @@ with gr.Blocks(css=css) as iface:
     with gr.Tab("1) Full-Sequence Analysis"):
         with gr.Row():
             with gr.Column(scale=1):
-                file_input = gr.File(
-                    label="Upload FASTA file",
-                    file_types=[".fasta", ".fa", ".txt"],
-                    type="filepath"
-                )
-                text_input = gr.Textbox(
-                    label="Or paste FASTA sequence",
-                    placeholder=">sequence_name\nACGTACGT...",
-                    lines=5
-                )
-                top_k = gr.Slider(
-                    minimum=5,
-                    maximum=30,
-                    value=10,
-                    step=1,
-                    label="Number of top k-mers to display"
-                )
-                win_size = gr.Slider(
-                    minimum=100,
-                    maximum=5000,
-                    value=500,
-                    step=100,
-                    label="Window size for 'most pushing' subregions"
-                )
                 analyze_btn = gr.Button("Analyze Sequence", variant="primary")
             with gr.Column(scale=2):
-                results_box = gr.Textbox(
-                    label="Classification Results", lines=12, interactive=False
-                )
                 kmer_img = gr.Image(label="Top k-mer SHAP")
                 genome_img = gr.Image(label="Genome-wide SHAP Heatmap (Blue=neg, White=0, Red=pos)")
         seq_state = gr.State()
         header_state = gr.State()
-        # analyze_sequence(...) returns 5 items
         analyze_btn.click(
             analyze_sequence,
             inputs=[file_input, top_k, text_input, win_size],
             outputs=[results_box, kmer_img, genome_img, seq_state, header_state]
         )
     with gr.Tab("2) Subregion Exploration"):
         gr.Markdown("""
         **Subregion Analysis**
-        Select start/end positions to view local SHAP signals, distribution, and GC content.
-        The heatmap also uses the same Blue-White-Red scale.
         """)
         with gr.Row():
             region_start = gr.Number(label="Region Start", value=0)
             region_end = gr.Number(label="Region End", value=500)
             region_btn = gr.Button("Analyze Subregion")
-        subregion_info = gr.Textbox(
-            label="Subregion Analysis",
-            lines=7,
-            interactive=False
-        )
         with gr.Row():
             subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
             subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
         region_btn.click(
             analyze_subregion,
             inputs=[seq_state, header_state, region_start, region_end],
             outputs=[subregion_info, subregion_img, subregion_hist_img]
         )
     gr.Markdown("""
     ### Interface Features
     - **Overall Classification** (human vs non-human) using k-mer frequencies.
@@ -578,7 +501,11 @@ with gr.Blocks(css=css) as iface:
       - GC content
       - Fraction of positions pushing human vs. non-human
       - Simple logic-based classification
     """)
 if __name__ == "__main__":
-    iface.launch()

 import matplotlib.colors as mcolors
 import io
 from PIL import Image
+from scipy.interpolate import interp1d
 ###############################################################################
 # 1. MODEL DEFINITION
 ###############################################################################
 def parse_fasta(text):
     sequences = []
     current_header = None
     current_sequence = []
     for line in text.strip().split('\n'):
         line = line.strip()
+        if not line: continue
         if line.startswith('>'):
             if current_header:
                 sequences.append((current_header, ''.join(current_sequence)))
     return sequences
 def sequence_to_kmer_vector(sequence: str, k: int = 4) -> np.ndarray:
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     vec = np.zeros(len(kmers), dtype=np.float32)
     for i in range(len(sequence) - k + 1):
         kmer = sequence[i:i+k]
         if kmer in kmer_dict:
             vec[kmer_dict[kmer]] += 1
     total_kmers = len(sequence) - k + 1
     if total_kmers > 0:
+        vec /= total_kmers
     return vec
 ###############################################################################
 ###############################################################################
 def calculate_shap_values(model, x_tensor):
     model.eval()
     with torch.no_grad():
         baseline_output = model(x_tensor)
         baseline_probs = torch.softmax(baseline_output, dim=1)
+        baseline_prob = baseline_probs[0, 1].item()  # Prob of 'human'
         shap_values = []
         x_zeroed = x_tensor.clone()
         for i in range(x_tensor.shape[1]):
             output = model(x_zeroed)
             probs = torch.softmax(output, dim=1)
             prob = probs[0, 1].item()
+            shap_values.append(baseline_prob - prob)
+            x_zeroed[0, i] = original_val
     return np.array(shap_values), baseline_prob
 ###############################################################################
 ###############################################################################
 def compute_positionwise_scores(sequence, shap_values, k=4):
     kmers = [''.join(p) for p in product("ACGT", repeat=k)]
     kmer_dict = {km: i for i, km in enumerate(kmers)}
     seq_len = len(sequence)
     shap_sums = np.zeros(seq_len, dtype=np.float32)
     coverage = np.zeros(seq_len, dtype=np.float32)
     for i in range(seq_len - k + 1):
         kmer = sequence[i:i+k]
         if kmer in kmer_dict:
             val = shap_values[kmer_dict[kmer]]
+            shap_sums[i:i+k] += val
+            coverage[i:i+k] += 1
     with np.errstate(divide='ignore', invalid='ignore'):
         shap_means = np.where(coverage > 0, shap_sums / coverage, 0.0)
     return shap_means
 ###############################################################################
 ###############################################################################
 def find_extreme_subregion(shap_means, window_size=500, mode="max"):
     n = len(shap_means)
+    if n == 0: return (0, 0, 0.0)
     if window_size >= n:
+        return (0, n, float(np.mean(shap_means)))
     csum = np.zeros(n + 1, dtype=np.float32)
     csum[1:] = np.cumsum(shap_means)
     best_start = 0
     best_sum = csum[window_size] - csum[0]
     best_avg = best_sum / window_size
     for start in range(1, n - window_size + 1):
         wsum = csum[start + window_size] - csum[start]
         wavg = wsum / window_size
+        if mode == "max" and wavg > best_avg:
+            best_avg = wavg; best_start = start
+        elif mode == "min" and wavg < best_avg:
+            best_avg = wavg; best_start = start
     return (best_start, best_start + window_size, float(best_avg))
 ###############################################################################
 ###############################################################################
 def fig_to_image(fig):
     buf = io.BytesIO()
     fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
     buf.seek(0)
     return img
 def get_zero_centered_cmap():
+    colors = [(0.0, 'blue'), (0.5, 'white'), (1.0, 'red')]
+    return mcolors.LinearSegmentedColormap.from_list("blue_white_red", colors)
 def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, end=None):
     if start is not None and end is not None:
         local_shap = shap_means[start:end]
         subtitle = f" (positions {start}-{end})"
     else:
         local_shap = shap_means
         subtitle = ""
     if len(local_shap) == 0:
         local_shap = np.array([0.0])
     heatmap_data = local_shap.reshape(1, -1)
     min_val = np.min(local_shap)
     max_val = np.max(local_shap)
     extent = max(abs(min_val), abs(max_val))
+    cmap = get_zero_centered_cmap()
+    fig, ax = plt.subplots(figsize=(12, 1.8))
+    cax = ax.imshow(heatmap_data, aspect='auto', cmap=cmap, vmin=-extent, vmax=extent)
+    cbar = plt.colorbar(cax, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
+    cbar.ax.tick_params(labelsize=8)
+    cbar.set_label('SHAP Contribution', fontsize=9, labelpad=5)
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence', fontsize=10)
     ax.set_title(f"{title}{subtitle}", pad=10)
+    plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
     return fig
 def create_importance_bar_plot(shap_values, kmers, top_k=10):
     plt.rcParams.update({'font.size': 10})
     fig = plt.figure(figsize=(10, 5))
     indices = np.argsort(np.abs(shap_values))[-top_k:]
     values = shap_values[indices]
     features = [kmers[i] for i in indices]
     colors = ['#99ccff' if v < 0 else '#ff9999' for v in values]
     plt.barh(range(len(values)), values, color=colors)
     plt.yticks(range(len(values)), features)
     plt.xlabel('SHAP Value (impact on model output)')
     return fig
 def plot_shap_histogram(shap_array, title="SHAP Distribution in Region"):
     fig, ax = plt.subplots(figsize=(6, 4))
     ax.hist(shap_array, bins=30, color='gray', edgecolor='black')
     ax.axvline(0, color='red', linestyle='--', label='0.0')
     return fig
 def compute_gc_content(sequence):
+    if not sequence: return 0
     gc_count = sequence.count('G') + sequence.count('C')
     return (gc_count / len(sequence)) * 100.0
 ###############################################################################
 def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
     if fasta_text.strip():
         text = fasta_text.strip()
     elif file_obj is not None:
     else:
         return ("Please provide a FASTA sequence.", None, None, None, None)
     sequences = parse_fasta(text)
     if not sequences:
         return ("No valid FASTA sequences found.", None, None, None, None)
     header, seq = sequences[0]
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     try:
         state_dict = torch.load('model.pt', map_location=device, weights_only=True)
         model = VirusClassifier(256).to(device)
         model.load_state_dict(state_dict)
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
         return (f"Error loading model/scaler: {str(e)}", None, None, None, None)
     freq_vector = sequence_to_kmer_vector(seq)
     scaled_vector = scaler.transform(freq_vector.reshape(1, -1))
     x_tensor = torch.FloatTensor(scaled_vector).to(device)
     shap_values, prob_human = calculate_shap_values(model, x_tensor)
     prob_nonhuman = 1.0 - prob_human
     classification = "Human" if prob_human > 0.5 else "Non-human"
     confidence = max(prob_human, prob_nonhuman)
     shap_means = compute_positionwise_scores(seq, shap_values, k=4)
+    max_start, max_end, max_avg = find_extreme_subregion(shap_means, window_size, mode="max")
+    min_start, min_end, min_avg = find_extreme_subregion(shap_means, window_size, mode="min")
     results_text = (
         f"Sequence: {header}\n"
         f"Length: {len(seq):,} bases\n"
         f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
     )
     kmers = [''.join(p) for p in product("ACGT", repeat=4)]
     bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
     bar_img = fig_to_image(bar_fig)
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
+    state_dict_out = {"seq": seq, "shap_means": shap_means}
     return (results_text, bar_img, heatmap_img, state_dict_out, header)
 ###############################################################################
 def analyze_subregion(state, header, region_start, region_end):
     if not state or "seq" not in state or "shap_means" not in state:
         return ("No sequence data found. Please run Step 1 first.", None, None)
     seq = state["seq"]
     shap_means = state["shap_means"]
     region_start = int(region_start)
     region_end = int(region_end)
     region_start = max(0, min(region_start, len(seq)))
     region_end = max(0, min(region_end, len(seq)))
     if region_end <= region_start:
         return ("Invalid region range. End must be > Start.", None, None)
     region_seq = seq[region_start:region_end]
     region_shap = shap_means[region_start:region_end]
     gc_percent = compute_gc_content(region_seq)
     avg_shap = float(np.mean(region_shap))
     positive_fraction = np.mean(region_shap > 0)
     negative_fraction = np.mean(region_shap < 0)
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
         region_classification = "Likely pushing toward non-human"
     else:
         region_classification = "Near neutral (no strong push)"
     region_info = (
         f"Analyzing subregion of {header} from {region_start} to {region_end}\n"
         f"Region length: {len(region_seq)} bases\n"
         f"Fraction with SHAP < 0 (toward non-human): {negative_fraction:.2f}\n"
         f"Subregion interpretation: {region_classification}\n"
     )
+    heatmap_fig = plot_linear_heatmap(shap_means, title="Subregion SHAP", start=region_start, end=region_end)
     heatmap_img = fig_to_image(heatmap_fig)
     hist_fig = plot_shap_histogram(region_shap, title="SHAP Distribution in Subregion")
     hist_img = fig_to_image(hist_fig)
     return (region_info, heatmap_img, hist_img)
+###############################################################################
+# 9. COMPARISON ANALYSIS FUNCTIONS
+###############################################################################
+def normalize_shap_lengths(shap1, shap2, num_points=1000):
+    """
+    Resample two per-base SHAP profiles onto a shared normalized axis.
+
+    Each profile is treated as samples spread evenly over [0, 1] and is
+    linearly interpolated at `num_points` evenly spaced positions, so that
+    profiles of different lengths can be compared position-by-position.
+
+    Args:
+        shap1: 1D array-like of SHAP values for the first sequence.
+        shap2: 1D array-like of SHAP values for the second sequence.
+        num_points: Number of samples in each returned profile.
+
+    Returns:
+        (shap1_norm, shap2_norm): two float arrays of length `num_points`.
+        An empty input profile is returned as all zeros.
+    """
+    x_new = np.linspace(0, 1, num_points)
+
+    def resample(values):
+        values = np.asarray(values, dtype=float)
+        # Nothing to interpolate: report an empty profile as neutral (zeros)
+        # instead of raising.
+        if values.size == 0:
+            return np.zeros(num_points, dtype=float)
+        # np.interp performs the same linear interpolation as
+        # interp1d(kind='linear') on [0, 1], but also accepts a single
+        # sample (yielding a constant profile), where interp1d raises.
+        x_old = np.linspace(0, 1, values.size)
+        return np.interp(x_new, x_old, values)
+
+    return resample(shap1), resample(shap2)
+def compute_shap_difference(shap1_norm, shap2_norm):
+    """Return the element-wise SHAP difference, second profile minus first."""
+    difference = shap2_norm - shap1_norm
+    return difference
+def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
+    """
+    Draw a single-row heatmap of a SHAP difference profile.
+
+    The color limits are symmetric around zero (+/- the largest absolute
+    value in `shap_diff`) and the colormap comes from
+    get_zero_centered_cmap().
+
+    Args:
+        shap_diff: 1D numpy array of SHAP differences.
+        title: Title displayed above the heatmap.
+
+    Returns:
+        The Matplotlib figure holding the heatmap and its colorbar.
+    """
+    # imshow expects 2D data, so present the profile as one row.
+    row = shap_diff.reshape(1, -1)
+    lowest = np.min(shap_diff)
+    highest = np.max(shap_diff)
+    limit = max(abs(lowest), abs(highest))
+    diverging = get_zero_centered_cmap()
+    fig, ax = plt.subplots(figsize=(12, 1.8))
+    image = ax.imshow(
+        row,
+        aspect='auto',
+        cmap=diverging,
+        vmin=-limit,
+        vmax=limit,
+    )
+    colorbar = plt.colorbar(
+        image,
+        orientation='horizontal',
+        pad=0.25,
+        aspect=40,
+        shrink=0.8,
+    )
+    colorbar.ax.tick_params(labelsize=8)
+    colorbar.set_label('SHAP Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)
+    # A single-row heatmap has no meaningful y axis.
+    ax.set_yticks([])
+    ax.set_xlabel('Normalized Position (0-100%)', fontsize=10)
+    ax.set_title(title, pad=10)
+    plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
+    return fig
+def _extract_classification(results_text):
+    """Return the text after 'Classification: ' on its line, or 'Unknown' if the marker is absent."""
+    _, marker, rest = results_text.partition('Classification: ')
+    if not marker:
+        return "Unknown"
+    return rest.split('\n')[0].strip()
+def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
+    """
+    Compare the per-base SHAP profiles of two sequences.
+
+    Each input is run through analyze_sequence(); the two resulting
+    `shap_means` profiles are resampled to a common length, subtracted
+    (sequence 2 minus sequence 1), summarized, and plotted.
+
+    Args:
+        file1: Uploaded file for sequence 1, passed to analyze_sequence().
+        file2: Uploaded file for sequence 2, passed to analyze_sequence().
+        fasta1: Pasted FASTA text for sequence 1.
+        fasta2: Pasted FASTA text for sequence 2.
+
+    Returns:
+        (comparison_text, heatmap_img, hist_img). If either analysis fails,
+        returns (error_message, None, None).
+    """
+    # Detect failure from the missing state payload (index 3) rather than by
+    # searching the message for "Error": the early returns of
+    # analyze_sequence() for missing input and for unparseable FASTA do not
+    # contain that word, and a successful result can contain it in the header.
+    results = []
+    inputs = ((file1, fasta1), (file2, fasta2))
+    for idx, (file_obj, fasta_text) in enumerate(inputs, start=1):
+        res = analyze_sequence(file_obj, top_kmers=10, fasta_text=fasta_text, window_size=500)
+        if res[3] is None:
+            return (f"Error in sequence {idx}: {res[0]}", None, None)
+        results.append(res)
+    res1, res2 = results
+    shap1 = res1[3]["shap_means"]
+    shap2 = res2[3]["shap_means"]
+    shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
+    shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
+    avg_diff = np.mean(shap_diff)
+    std_diff = np.std(shap_diff)
+    max_diff = np.max(shap_diff)
+    min_diff = np.min(shap_diff)
+    # Positions whose absolute difference exceeds this count as "substantial".
+    threshold = 0.05
+    frac_different = np.mean(np.abs(shap_diff) > threshold)
+    # NOTE(review): assumes the results text contains a 'Classification: '
+    # line; falls back to 'Unknown' when it does not - confirm the format.
+    classification1 = _extract_classification(res1[0])
+    classification2 = _extract_classification(res2[0])
+    comparison_text = (
+        "Sequence Comparison Results:\n"
+        f"Sequence 1: {res1[4]}\n"
+        f"Length: {len(shap1):,} bases\n"
+        f"Classification: {classification1}\n\n"
+        f"Sequence 2: {res2[4]}\n"
+        f"Length: {len(shap2):,} bases\n"
+        f"Classification: {classification2}\n\n"
+        "Comparison Statistics:\n"
+        f"Average SHAP difference: {avg_diff:.4f}\n"
+        f"Standard deviation: {std_diff:.4f}\n"
+        f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
+        f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
+        f"Fraction of positions with substantial differences: {frac_different:.2%}\n\n"
+        "Interpretation:\n"
+        "Positive values (red) indicate regions where Sequence 2 is more 'human-like'\n"
+        "Negative values (blue) indicate regions where Sequence 1 is more 'human-like'"
+    )
+    heatmap_fig = plot_comparative_heatmap(shap_diff)
+    heatmap_img = fig_to_image(heatmap_fig)
+    hist_fig = plot_shap_histogram(shap_diff, title="Distribution of SHAP Differences")
+    hist_img = fig_to_image(hist_fig)
+    return comparison_text, heatmap_img, hist_img
 ###############################################################################
+# 10. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
     with gr.Tab("1) Full-Sequence Analysis"):  # Tab 1: classify one sequence and plot its SHAP signals
         with gr.Row():
             with gr.Column(scale=1):  # inputs: FASTA upload or pasted text, two sliders, and the run button
+                file_input = gr.File(label="Upload FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
+                text_input = gr.Textbox(label="Or paste FASTA sequence", placeholder=">sequence_name\nACGTACGT...", lines=5)
+                top_k = gr.Slider(minimum=5, maximum=30, value=10, step=1, label="Number of top k-mers to display")
+                win_size = gr.Slider(minimum=100, maximum=5000, value=500, step=100, label="Window size for 'most pushing' subregions")
                 analyze_btn = gr.Button("Analyze Sequence", variant="primary")
             with gr.Column(scale=2):  # outputs: results text and two plot images
+                results_box = gr.Textbox(label="Classification Results", lines=12, interactive=False)
                 kmer_img = gr.Image(label="Top k-mer SHAP")
                 genome_img = gr.Image(label="Genome-wide SHAP Heatmap (Blue=neg, White=0, Red=pos)")
         seq_state = gr.State()  # written by the click handler below; passed to analyze_subregion in tab 2
         header_state = gr.State()  # written by the click handler below; passed to analyze_subregion in tab 2
         analyze_btn.click(
             analyze_sequence,
             inputs=[file_input, top_k, text_input, win_size],  # presumably (file, top_kmers, fasta_text, window_size) -- confirm against analyze_sequence
             outputs=[results_box, kmer_img, genome_img, seq_state, header_state]  # last two slots feed the state objects above
         )
     with gr.Tab("2) Subregion Exploration"):  # Tab 2: inspect a start/end window of the sequence analyzed in tab 1
         gr.Markdown("""
         **Subregion Analysis**
+        Select start/end positions to view local SHAP signals, distribution, GC content, etc.
+        The heatmap also uses the same Blue-White-Red scale.
         """)
         with gr.Row():  # region bounds and the trigger button share one row
             region_start = gr.Number(label="Region Start", value=0)
             region_end = gr.Number(label="Region End", value=500)
             region_btn = gr.Button("Analyze Subregion")
+        subregion_info = gr.Textbox(label="Subregion Analysis", lines=7, interactive=False)
         with gr.Row():  # heatmap and histogram shown side by side
             subregion_img = gr.Image(label="Subregion SHAP Heatmap (B-W-R)")
             subregion_hist_img = gr.Image(label="SHAP Distribution (Histogram)")
         region_btn.click(
             analyze_subregion,
             inputs=[seq_state, header_state, region_start, region_end],  # the two state values are filled by tab 1's analyze click
             outputs=[subregion_info, subregion_img, subregion_hist_img]  # text, heatmap image, histogram image
         )
+    with gr.Tab("3) Comparative Analysis"):  # Tab 3: compare the SHAP profiles of two sequences
+        gr.Markdown("""
+        **Compare Two Sequences**
+        Upload or paste two FASTA sequences to compare their SHAP patterns.
+        The sequences will be normalized to the same length for comparison.
+        **Color Scale**:
+        - Red: Sequence 2 is more human-like in this region
+        - Blue: Sequence 1 is more human-like in this region
+        - White: No substantial difference
+        """)
+        with gr.Row():  # one input column per sequence
+            with gr.Column(scale=1):  # sequence 1: file upload or pasted text
+                file_input1 = gr.File(label="Upload first FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
+                text_input1 = gr.Textbox(label="Or paste first FASTA sequence", placeholder=">sequence1\nACGTACGT...", lines=5)
+            with gr.Column(scale=1):  # sequence 2: file upload or pasted text
+                file_input2 = gr.File(label="Upload second FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
+                text_input2 = gr.Textbox(label="Or paste second FASTA sequence", placeholder=">sequence2\nACGTACGT...", lines=5)
+        compare_btn = gr.Button("Compare Sequences", variant="primary")
+        comparison_text = gr.Textbox(label="Comparison Results", lines=12, interactive=False)
+        with gr.Row():  # difference heatmap and histogram shown side by side
+            diff_heatmap = gr.Image(label="SHAP Difference Heatmap")
+            diff_hist = gr.Image(label="Distribution of SHAP Differences")
+        compare_btn.click(
+            analyze_sequence_comparison,
+            inputs=[file_input1, file_input2, text_input1, text_input2],  # order matches (file1, file2, fasta1, fasta2)
+            outputs=[comparison_text, diff_heatmap, diff_hist]  # matches the function's 3-tuple return
+        )
     gr.Markdown("""
     ### Interface Features
     - **Overall Classification** (human vs non-human) using k-mer frequencies.
       - GC content
       - Fraction of positions pushing human vs. non-human
       - Simple logic-based classification
+    - **Sequence Comparison**:
+      - Compare two sequences to identify regions of difference
+      - Normalized comparison to handle different sequence lengths
+      - Statistical summary of differences
     """)
 if __name__ == "__main__":  # start the Gradio app only when this file is run as a script
+    iface.launch()