Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12, 2025

Commit

552aec4

verified ·

1 Parent(s): 6d0235b

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -34

app.py CHANGED Viewed

@@ -133,7 +133,52 @@ def compute_positionwise_scores(sequence, shap_values, k=4):
     return shap_means
 ###############################################################################
-# 5. PLOTTING / UTILITIES
 ###############################################################################
 def fig_to_image(fig):
@@ -150,7 +195,7 @@ def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, e
     Plots a 1D heatmap of per-base SHAP contributions.
     Negative = push toward Non-Human, Positive = push toward Human.
     Optionally can show only a subrange (start:end).
-    We'll add extra bottom margin to avoid x-axis overlap.
     """
     if start is not None and end is not None:
         shap_means = shap_means[start:end]
@@ -162,17 +207,17 @@ def plot_linear_heatmap(shap_means, title="Per-base SHAP Heatmap", start=None, e
     fig, ax = plt.subplots(figsize=(12, 2))
     cax = ax.imshow(heatmap_data, aspect='auto', cmap='RdBu_r')
-    cbar = plt.colorbar(cax, orientation='horizontal', pad=0.3)
     cbar.set_label('SHAP Contribution')
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence')
     ax.set_title(f"{title}{subtitle}")
-    # Extra spacing for x-axis labels
-    plt.tight_layout()
-    # Or you can do something like:
-    # plt.subplots_adjust(bottom=0.2)
     return fig
@@ -219,11 +264,14 @@ def compute_gc_content(sequence):
     return (gc_count / len(sequence)) * 100.0
 ###############################################################################
-# 6. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
-def analyze_sequence(file_obj, top_kmers=10, fasta_text=""):
-    """Analyzes the entire genome, returning classification and a heatmap."""
     # Handle input
     if fasta_text.strip():
         text = fasta_text.strip()
@@ -232,14 +280,14 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text=""):
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
-            return (f"Error reading file: {str(e)}", None, None, None, None)
     else:
-        return ("Please provide a FASTA sequence.", None, None, None, None)
     # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
-        return ("No valid FASTA sequences found.", None, None, None, None)
     header, seq = sequences[0]
@@ -250,7 +298,7 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text=""):
         model.load_state_dict(torch.load('model.pt', map_location=device))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
-        return (f"Error loading model: {str(e)}", None, None, None, None)
     # Vectorize + scale
     freq_vector = sequence_to_kmer_vector(seq)
@@ -264,13 +312,26 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text=""):
     classification = "Human" if prob_human > 0.5 else "Non-human"
     confidence = max(prob_human, prob_nonhuman)
     # Build results text
     results_text = (
         f"Sequence: {header}\n"
         f"Length: {len(seq):,} bases\n"
         f"Classification: {classification}\n"
         f"Confidence: {confidence:.3f}\n"
-        f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})"
     )
     # K-mer importance plot
@@ -278,26 +339,27 @@ def analyze_sequence(file_obj, top_kmers=10, fasta_text=""):
     bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
     bar_img = fig_to_image(bar_fig)
-    # Per-base SHAP for entire genome
-    shap_means = compute_positionwise_scores(seq, shap_values, k=4)
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
     # Return:
-    # 1) results text
-    # 2) k-mer bar image
-    # 3) full-genome heatmap
-    # 4) the "state" we need for step 2: (sequence, shap_means)
-    #    We'll store these in a dictionary so we can pass it around in Gradio.
     state_dict = {
         "seq": seq,
         "shap_means": shap_means
     }
-    return (results_text, bar_img, heatmap_img, state_dict, header)
 ###############################################################################
-# 7. SUBREGION ANALYSIS (Gradio Step 2)
 ###############################################################################
 def analyze_subregion(state, header, region_start, region_end):
@@ -333,7 +395,6 @@ def analyze_subregion(state, header, region_start, region_end):
     negative_fraction = np.mean(region_shap < 0)
     # Simple logic-based interpretation
-    # Adjust thresholds as needed
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
@@ -368,7 +429,7 @@ def analyze_subregion(state, header, region_start, region_end):
 ###############################################################################
-# 8. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
@@ -380,7 +441,7 @@ css = """
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
     # Virus Host Classifier (with Interactive Region Viewer)
-    **Step 1**: Predict overall viral sequence origin (human vs non-human)
     **Step 2**: Explore subregions to see local SHAP signals, distribution, GC content, etc.
     """)
@@ -404,26 +465,37 @@ with gr.Blocks(css=css) as iface:
                     step=1,
                     label="Number of top k-mers to display"
                 )
                 analyze_btn = gr.Button("Analyze Sequence", variant="primary")
             with gr.Column(scale=2):
                 results_box = gr.Textbox(
-                    label="Classification Results", lines=7, interactive=False
                 )
                 kmer_img = gr.Image(label="Top k-mer SHAP")
                 genome_img = gr.Image(label="Genome-wide SHAP Heatmap")
         # Hidden states that store data for step 2
-        # "seq_state" will hold { seq, shap_means }.
-        # "header_state" is optional meta info
         seq_state = gr.State()
         header_state = gr.State()
-        # The "analyze_sequence" function returns 5 values, which we map here:
         analyze_btn.click(
             analyze_sequence,
-            inputs=[file_input, top_k, text_input],
-            outputs=[results_box, kmer_img, genome_img, seq_state, header_state]
         )
     with gr.Tab("2) Subregion Exploration"):
@@ -460,6 +532,9 @@ with gr.Blocks(css=css) as iface:
        - Local SHAP signals (heatmap & histogram)
        - GC content, fraction of bases pushing "human" vs "non-human"
        - Simple logic-based interpretation based on average SHAP
     """)
 if __name__ == "__main__":

     return shap_means
 ###############################################################################
+# 5. FIND EXTREME SHAP REGIONS
+###############################################################################
def find_extreme_subregion(shap_means, window_size=500, mode="max"):
    """
    Find the fixed-length window with the most extreme average SHAP value.

    Args:
        shap_means: Sequence of per-position SHAP values (assumed 1D;
            it is converted to a float NumPy array internally).
        window_size: Length of the sliding window, in positions. Coerced to
            int so that float values (e.g. from a UI slider) are accepted.
        mode: "max" selects the window with the highest average; any other
            value selects the window with the lowest average.

    Returns:
        Tuple (best_start, best_end, avg_shap) describing the half-open
        window [best_start, best_end). If the window is at least as long as
        the input, the whole input is returned as (0, n, mean), with a mean
        of 0.0 for empty input. On ties the earliest window wins.

    Raises:
        ValueError: If window_size is smaller than 1 while the input is
            longer than the window.
    """
    values = np.asarray(shap_means, dtype=float)
    window_size = int(window_size)
    n = len(values)
    if window_size >= n:
        # Window covers the entire sequence: report the global average.
        avg_val = float(np.mean(values)) if n > 0 else 0.0
        return (0, n, avg_val)
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # Zero-prefixed cumulative sum: prefix[i] == sum(values[:i]), so the sum
    # over [start, start + window_size) is prefix[start + w] - prefix[start].
    # Plain np.cumsum is inclusive and has no entry for index n, which would
    # shift every window by one and overflow on the last window.
    prefix = np.concatenate(([0.0], np.cumsum(values)))
    window_sums = prefix[window_size:] - prefix[:-window_size]

    # argmax/argmin return the first occurrence, matching a left-to-right
    # scan that only replaces the best on a strict improvement.
    pick = np.argmax if mode == "max" else np.argmin
    best_start = int(pick(window_sums))
    best_avg = float(window_sums[best_start] / window_size)
    return (best_start, best_start + window_size, best_avg)
+###############################################################################
+# 6. PLOTTING / UTILITIES
 ###############################################################################
 def fig_to_image(fig):
     Plots a 1D heatmap of per-base SHAP contributions.
     Negative = push toward Non-Human, Positive = push toward Human.
     Optionally can show only a subrange (start:end).
+    We'll adjust layout so that the colorbar is below the x-axis and doesn't overlap.
     """
     if start is not None and end is not None:
         shap_means = shap_means[start:end]
     fig, ax = plt.subplots(figsize=(12, 2))
     cax = ax.imshow(heatmap_data, aspect='auto', cmap='RdBu_r')
+    # Adjust colorbar with some extra margin
+    # We'll place the colorbar horizontally below
+    cbar = plt.colorbar(cax, orientation='horizontal', pad=0.25)
     cbar.set_label('SHAP Contribution')
     ax.set_yticks([])
     ax.set_xlabel('Position in Sequence')
     ax.set_title(f"{title}{subtitle}")
+    # Additional spacing at bottom to avoid overlap
+    plt.subplots_adjust(bottom=0.3)
     return fig
     return (gc_count / len(sequence)) * 100.0
 ###############################################################################
+# 7. MAIN ANALYSIS STEP (Gradio Step 1)
 ###############################################################################
+def analyze_sequence(file_obj, top_kmers=10, fasta_text="", window_size=500):
+    """
+    Analyzes the entire genome, returning classification, full-genome heatmap,
+    top k-mer bar plot, and identifies subregions with strongest positive/negative push.
+    """
     # Handle input
     if fasta_text.strip():
         text = fasta_text.strip()
             with open(file_obj, 'r') as f:
                 text = f.read()
         except Exception as e:
+            return (f"Error reading file: {str(e)}", None, None, None, None, None)
     else:
+        return ("Please provide a FASTA sequence.", None, None, None, None, None)
     # Parse FASTA
     sequences = parse_fasta(text)
     if not sequences:
+        return ("No valid FASTA sequences found.", None, None, None, None, None)
     header, seq = sequences[0]
         model.load_state_dict(torch.load('model.pt', map_location=device))
         scaler = joblib.load('scaler.pkl')
     except Exception as e:
+        return (f"Error loading model: {str(e)}", None, None, None, None, None)
     # Vectorize + scale
     freq_vector = sequence_to_kmer_vector(seq)
     classification = "Human" if prob_human > 0.5 else "Non-human"
     confidence = max(prob_human, prob_nonhuman)
+    # Per-base SHAP
+    shap_means = compute_positionwise_scores(seq, shap_values, k=4)
+    # Find the most "human-pushing" region
+    (max_start, max_end, max_avg) = find_extreme_subregion(shap_means, window_size, mode="max")
+    # Find the most "non-human–pushing" region
+    (min_start, min_end, min_avg) = find_extreme_subregion(shap_means, window_size, mode="min")
     # Build results text
     results_text = (
         f"Sequence: {header}\n"
         f"Length: {len(seq):,} bases\n"
         f"Classification: {classification}\n"
         f"Confidence: {confidence:.3f}\n"
+        f"(Human Probability: {prob_human:.3f}, Non-human Probability: {prob_nonhuman:.3f})\n\n"
+        f"---\n"
+        f"**Most Human-Pushing {window_size}-bp Subregion**:\n"
+        f"Start: {max_start}, End: {max_end}, Avg SHAP: {max_avg:.4f}\n\n"
+        f"**Most Non-Human–Pushing {window_size}-bp Subregion**:\n"
+        f"Start: {min_start}, End: {min_end}, Avg SHAP: {min_avg:.4f}"
     )
     # K-mer importance plot
     bar_fig = create_importance_bar_plot(shap_values, kmers, top_kmers)
     bar_img = fig_to_image(bar_fig)
+    # Full-genome SHAP heatmap
     heatmap_fig = plot_linear_heatmap(shap_means, title="Genome-wide SHAP")
     heatmap_img = fig_to_image(heatmap_fig)
     # Return:
+    #   1) results text
+    #   2) k-mer bar image
+    #   3) full-genome heatmap
+    #   4) "state" with { seq, shap_means }, for subregion analysis
+    #   5) the sequence header
+    #   6) a None placeholder; the "most pushing" subregion info is
+    #      reported in the results text instead of being returned separately.
     state_dict = {
         "seq": seq,
         "shap_means": shap_means
     }
+    return (results_text, bar_img, heatmap_img, state_dict, header, None)
 ###############################################################################
+# 8. SUBREGION ANALYSIS (Gradio Step 2)
 ###############################################################################
 def analyze_subregion(state, header, region_start, region_end):
     negative_fraction = np.mean(region_shap < 0)
     # Simple logic-based interpretation
     if avg_shap > 0.05:
         region_classification = "Likely pushing toward human"
     elif avg_shap < -0.05:
 ###############################################################################
+# 9. BUILD GRADIO INTERFACE
 ###############################################################################
 css = """
 with gr.Blocks(css=css) as iface:
     gr.Markdown("""
     # Virus Host Classifier (with Interactive Region Viewer)
+    **Step 1**: Predict overall viral sequence origin (human vs non-human) and identify extreme regions.
     **Step 2**: Explore subregions to see local SHAP signals, distribution, GC content, etc.
     """)
                     step=1,
                     label="Number of top k-mers to display"
                 )
+                win_size = gr.Slider(
+                    minimum=100,
+                    maximum=5000,
+                    value=500,
+                    step=100,
+                    label="Window size for 'most pushing' subregions"
+                )
                 analyze_btn = gr.Button("Analyze Sequence", variant="primary")
             with gr.Column(scale=2):
                 results_box = gr.Textbox(
+                    label="Classification Results", lines=12, interactive=False
                 )
                 kmer_img = gr.Image(label="Top k-mer SHAP")
                 genome_img = gr.Image(label="Genome-wide SHAP Heatmap")
         # Hidden states that store data for step 2
         seq_state = gr.State()
         header_state = gr.State()
+        # The "analyze_sequence" function returns 6 values, which we map here:
+        # 1) results_text
+        # 2) bar_img
+        # 3) heatmap_img
+        # 4) state_dict
+        # 5) header
+        # 6) None placeholder
         analyze_btn.click(
             analyze_sequence,
+            inputs=[file_input, top_k, text_input, win_size],
+            outputs=[results_box, kmer_img, genome_img, seq_state, header_state, None]
         )
     with gr.Tab("2) Subregion Exploration"):
        - Local SHAP signals (heatmap & histogram)
        - GC content, fraction of bases pushing "human" vs "non-human"
        - Simple logic-based interpretation based on average SHAP
+    5. **Identification of the most 'human-pushing' subregion** (max average SHAP)
+       and the most 'non-human–pushing' subregion (min average SHAP),
+       each of a chosen window size.
     """)
 if __name__ == "__main__":