Spaces:

hiyata
/

HostClassifier

Running

App Files Files Community

hiyata committed on Jan 12, 2025

Commit

87c2305

verified ·

1 Parent(s): 77621ec

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -21

app.py CHANGED Viewed

@@ -320,38 +320,65 @@ def analyze_subregion(state, header, region_start, region_end):
 ###############################################################################
 def normalize_shap_lengths(shap1, shap2, num_points=1000):
     """
     Resample two SHAP profiles onto a shared relative-position grid.

     Each profile is treated as evenly spaced over the interval [0, 1] and is
     linearly interpolated at ``num_points`` evenly spaced positions, so that
     profiles of different lengths can be compared element-wise.

     Args:
         shap1: Array-like of SHAP values for the first sequence.
         shap2: Array-like of SHAP values for the second sequence.
         num_points: Number of positions in the shared grid.

     Returns:
         Tuple ``(shap1_norm, shap2_norm)`` of 1-D float arrays, each of
         length ``num_points``.

     Raises:
         ValueError: If either profile contains no values.
     """
     profiles = []
     for label, values in (("shap1", shap1), ("shap2", shap2)):
         # Accept lists, nested lists and column vectors alike.
         arr = np.asarray(values, dtype=float).ravel()
         if arr.size == 0:
             raise ValueError(f"{label} is empty; cannot interpolate")
         profiles.append(arr)
     x_new = np.linspace(0, 1, num_points)
     # np.interp performs piecewise-linear interpolation directly, which is
     # all this function needs, and it also accepts a single-value profile.
     shap1_norm, shap2_norm = (
         np.interp(x_new, np.linspace(0, 1, arr.size), arr) for arr in profiles
     )
     return shap1_norm, shap2_norm
 def compute_shap_difference(shap1_norm, shap2_norm):
     """
     Return the element-wise SHAP difference ``shap2_norm - shap1_norm``.

     Both arguments are converted with ``np.asarray`` first, so plain Python
     sequences are accepted as well as NumPy arrays.

     Args:
         shap1_norm: Array-like of SHAP values for the first sequence.
         shap2_norm: Array-like of SHAP values for the second sequence.

     Returns:
         NumPy array holding ``shap2_norm - shap1_norm``.
     """
     first = np.asarray(shap1_norm)
     second = np.asarray(shap2_norm)
     return second - first
 def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     """
     Draw a one-row heatmap of SHAP differences on a symmetric colour scale.

     Args:
         shap_diff: Array-like of SHAP differences (Seq2 - Seq1).
         title: Title placed above the heatmap.

     Returns:
         The matplotlib figure containing the heatmap and its colorbar.

     Raises:
         ValueError: If ``shap_diff`` contains no values.
     """
     values = np.asarray(shap_diff, dtype=float).ravel()
     if values.size == 0:
         raise ValueError("shap_diff is empty; nothing to plot")
     # Symmetric colour limit taken from the finite values only.
     finite = values[np.isfinite(values)]
     limit = float(np.max(np.abs(finite))) if finite.size else 0.0
     if limit == 0.0:
         # A zero-width colour range (vmin == vmax) would not place 0 at the
         # centre of the colormap; fall back to a unit range instead.
         limit = 1.0
     heatmap_data = values.reshape(1, -1)
     cmap = get_zero_centered_cmap()
     fig, ax = plt.subplots(figsize=(12, 1.8))
     # Map the image onto x in [0, 100] so the ticks agree with the axis label.
     cax = ax.imshow(
         heatmap_data,
         aspect='auto',
         cmap=cmap,
         vmin=-limit,
         vmax=limit,
         extent=(0, 100, 0, 1),
     )
     cbar = plt.colorbar(cax, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
     cbar.ax.tick_params(labelsize=8)
     cbar.set_label('SHAP Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)
     ax.set_yticks([])
     ax.set_xlabel('Normalized Position (0-100%)', fontsize=10)
     ax.set_title(title, pad=10)
     plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
     return fig
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
     # Analyze first sequence
     res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
     if isinstance(res1[0], str) and "Error" in res1[0]:
         return (f"Error in sequence 1: {res1[0]}", None, None)
     # Analyze second sequence
     res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
     if isinstance(res2[0], str) and "Error" in res2[0]:
@@ -359,46 +386,52 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
     shap1 = res1[3]["shap_means"]
     shap2 = res2[3]["shap_means"]
     shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
     shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
     avg_diff = np.mean(shap_diff)
     std_diff = np.std(shap_diff)
     max_diff = np.max(shap_diff)
     min_diff = np.min(shap_diff)
     threshold = 0.05
     substantial_diffs = np.abs(shap_diff) > threshold
     frac_different = np.mean(substantial_diffs)
-    classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
-    classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
     len1_formatted = "{:,}".format(len(shap1))
     len2_formatted = "{:,}".format(len(shap2))
-    frac_formatted = "{:.2%}".format(frac_different)
     comparison_text = (
         "Sequence Comparison Results:\n"
-        f"Sequence 1: {res1[4]}\n"
-        f"Length: {len1_formatted} bases\n"
         f"Classification: {classification1}\n\n"
-        f"Sequence 2: {res2[4]}\n"
-        f"Length: {len2_formatted} bases\n"
         f"Classification: {classification2}\n\n"
         "Comparison Statistics:\n"
         f"Average SHAP difference: {avg_diff:.4f}\n"
         f"Standard deviation: {std_diff:.4f}\n"
         f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
         f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
-        f"Fraction of positions with substantial differences: {frac_formatted}\n\n"
         "Interpretation:\n"
-        "Positive values (red) indicate regions where Sequence 2 is more 'human-like'\n"
-        "Negative values (blue) indicate regions where Sequence 1 is more 'human-like'"
     )
     heatmap_fig = plot_comparative_heatmap(shap_diff)
     heatmap_img = fig_to_image(heatmap_fig)
     hist_fig = plot_shap_histogram(shap_diff, title="Distribution of SHAP Differences")
     hist_img = fig_to_image(hist_fig)
     return comparison_text, heatmap_img, hist_img
 ###############################################################################

 ###############################################################################
 def normalize_shap_lengths(shap1, shap2, num_points=1000):
     """
     Normalize SHAP values to relative positions (0-1 scale).

     Each point represents a relative position in the sequence
     (e.g., 0.75 = 75% through the sequence). Both profiles are linearly
     interpolated at the same ``num_points`` evenly spaced positions.

     Args:
         shap1: Array-like of SHAP values for the first sequence.
         shap2: Array-like of SHAP values for the second sequence.
         num_points: Number of relative positions to sample.

     Returns:
         Tuple ``(shap1_interp, shap2_interp)`` of 1-D float arrays, each of
         length ``num_points``.

     Raises:
         ValueError: If either profile contains no values.
     """
     # Flatten to 1-D float arrays so lists and column vectors are accepted.
     values1 = np.asarray(shap1, dtype=float).ravel()
     values2 = np.asarray(shap2, dtype=float).ravel()
     if values1.size == 0 or values2.size == 0:
         raise ValueError("Cannot normalize an empty SHAP profile")
     # Relative position (0 to 1) of every original value
     x1 = np.linspace(0, 1, values1.size)
     x2 = np.linspace(0, 1, values2.size)
     # Shared relative positions used for the comparison
     x_norm = np.linspace(0, 1, num_points)
     # Interpolate both sequences at the shared positions
     shap1_interp = np.interp(x_norm, x1, values1)
     shap2_interp = np.interp(x_norm, x2, values2)
     return shap1_interp, shap2_interp
 def compute_shap_difference(shap1_norm, shap2_norm):
     """
     Compute the SHAP difference between normalized sequences.

     Inputs are converted with ``np.asarray`` so that plain Python sequences
     work as well as NumPy arrays.

     Args:
         shap1_norm: Array-like of normalized SHAP values for sequence 1.
         shap2_norm: Array-like of normalized SHAP values for sequence 2.

     Returns:
         NumPy array equal to ``shap2_norm - shap1_norm``.
     """
     return np.asarray(shap2_norm) - np.asarray(shap1_norm)
 def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
     """
     Plot heatmap using relative positions (0-100%).

     Args:
         shap_diff: Array-like of SHAP differences (Seq2 - Seq1), one value
             per relative position.
         title: Title placed above the heatmap.

     Returns:
         The matplotlib figure containing the heatmap and its colorbar.

     Raises:
         ValueError: If ``shap_diff`` contains no values.
     """
     values = np.asarray(shap_diff, dtype=float).ravel()
     if values.size == 0:
         raise ValueError("shap_diff is empty; nothing to plot")
     heatmap_data = values.reshape(1, -1)
     # Symmetric colour limit, ignoring NaN/inf entries.
     finite = values[np.isfinite(values)]
     extent = float(np.max(np.abs(finite))) if finite.size else 0.0
     if extent == 0.0:
         # With vmin == vmax the colour scale has zero width and 0 would not
         # land on the centre of the colormap; use a unit range instead.
         extent = 1.0
     fig, ax = plt.subplots(figsize=(12, 1.8))
     cmap = get_zero_centered_cmap()
     cax = ax.imshow(heatmap_data, aspect='auto', cmap=cmap, vmin=-extent, vmax=extent)
     # Create percentage-based x-axis ticks
     num_ticks = 5
     tick_positions = np.linspace(0, values.size - 1, num_ticks)
     # round() rather than int() so float error cannot truncate a label.
     tick_labels = [f"{round(frac * 100)}%" for frac in np.linspace(0, 1, num_ticks)]
     ax.set_xticks(tick_positions)
     ax.set_xticklabels(tick_labels)
     cbar = plt.colorbar(cax, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
     cbar.ax.tick_params(labelsize=8)
     cbar.set_label('SHAP Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)
     ax.set_yticks([])
     ax.set_xlabel('Relative Position in Sequence', fontsize=10)
     ax.set_title(title, pad=10)
     plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
     return fig
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
+    """
+    Compare two sequences using relative positions (0-1 scale)
+    """
     # Analyze first sequence
     res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
     if isinstance(res1[0], str) and "Error" in res1[0]:
         return (f"Error in sequence 1: {res1[0]}", None, None)
     # Analyze second sequence
     res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
     if isinstance(res2[0], str) and "Error" in res2[0]:
     shap1 = res1[3]["shap_means"]
     shap2 = res2[3]["shap_means"]
+    # Normalize to relative positions
     shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
     shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
+    # Calculate statistics
     avg_diff = np.mean(shap_diff)
     std_diff = np.std(shap_diff)
     max_diff = np.max(shap_diff)
     min_diff = np.min(shap_diff)
     threshold = 0.05
     substantial_diffs = np.abs(shap_diff) > threshold
     frac_different = np.mean(substantial_diffs)
+    # Format output text
     len1_formatted = "{:,}".format(len(shap1))
     len2_formatted = "{:,}".format(len(shap2))
+    classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
+    classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
     comparison_text = (
         "Sequence Comparison Results:\n"
+        f"Sequence 1: {res1[4]} (Length: {len1_formatted} bases)\n"
         f"Classification: {classification1}\n\n"
+        f"Sequence 2: {res2[4]} (Length: {len2_formatted} bases)\n"
         f"Classification: {classification2}\n\n"
         "Comparison Statistics:\n"
         f"Average SHAP difference: {avg_diff:.4f}\n"
         f"Standard deviation: {std_diff:.4f}\n"
         f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
         f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
+        f"Fraction of positions with substantial differences: {frac_different:.2%}\n\n"
+        "Note: Comparisons shown at relative positions (0-100%) in each sequence\n"
         "Interpretation:\n"
+        "- Red regions: Sequence 2 is more human-like at that relative position\n"
+        "- Blue regions: Sequence 1 is more human-like at that relative position\n"
+        "- White regions: Similar between sequences"
     )
+    # Generate visualizations
     heatmap_fig = plot_comparative_heatmap(shap_diff)
     heatmap_img = fig_to_image(heatmap_fig)
     hist_fig = plot_shap_histogram(shap_diff, title="Distribution of SHAP Differences")
     hist_img = fig_to_image(hist_fig)
     return comparison_text, heatmap_img, hist_img
 ###############################################################################