Spaces:

hiyata
/

HostClassifier

Sleeping

App Files Files Community

hiyata committed on Jan 12, 2025

Commit

6c4adfb

verified ·

1 Parent(s): 87c2305

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -64

app.py CHANGED Viewed

@@ -319,67 +319,109 @@ def analyze_subregion(state, header, region_start, region_end):
 # 9. COMPARISON ANALYSIS FUNCTIONS
 ###############################################################################
def normalize_shap_lengths(shap1, shap2, num_points=1000):
    """
    Resample two SHAP series onto one shared relative-position grid.

    Each series is treated as spanning the interval [0, 1] (0.75 means 75%
    of the way through that sequence) and is linearly interpolated at
    ``num_points`` evenly spaced positions, so sequences of different
    lengths can be compared point by point.

    Args:
        shap1: First series of SHAP values.
        shap2: Second series of SHAP values.
        num_points: Number of positions in the shared grid.

    Returns:
        tuple: (shap1_interp, shap2_interp), each of length ``num_points``.
    """
    grid = np.linspace(0, 1, num_points)

    def _onto_grid(series):
        # The series' own samples are spread evenly over [0, 1]
        own_positions = np.linspace(0, 1, len(series))
        return np.interp(grid, own_positions, series)

    return _onto_grid(shap1), _onto_grid(shap2)
def compute_shap_difference(shap1_norm, shap2_norm):
    """
    Return the element-wise SHAP difference, second sequence minus first.

    Positive entries mean the second series has the larger SHAP value at
    that position; negative entries mean the first does.
    """
    difference = shap2_norm - shap1_norm
    return difference
def plot_comparative_heatmap(shap_diff, title="SHAP Difference Heatmap"):
    """
    Draw a one-row heatmap of SHAP differences over relative position.

    The colour scale is symmetric around zero and the x axis is labelled
    in percent (0%-100%) of the way through the sequence.

    Args:
        shap_diff: 1-D array of SHAP differences (it is reshaped to a
            single row and indexed via ``shape[0]``).
        title: Title placed above the heatmap.

    Returns:
        The matplotlib figure containing the heatmap and its colorbar.
    """
    single_row = shap_diff.reshape(1, -1)

    # Symmetric limits so that zero maps to the middle of the colormap
    lowest = np.min(shap_diff)
    highest = np.max(shap_diff)
    extent = max(abs(lowest), abs(highest))

    fig, ax = plt.subplots(figsize=(12, 1.8))
    image = ax.imshow(
        single_row,
        aspect='auto',
        cmap=get_zero_centered_cmap(),
        vmin=-extent,
        vmax=extent,
    )

    # Percentage-based x-axis ticks
    tick_count = 5
    fractions = np.linspace(0, 1, tick_count)
    ax.set_xticks(np.linspace(0, shap_diff.shape[0]-1, tick_count))
    ax.set_xticklabels([f"{int(x*100)}%" for x in fractions])

    colorbar = plt.colorbar(image, orientation='horizontal', pad=0.25, aspect=40, shrink=0.8)
    colorbar.ax.tick_params(labelsize=8)
    colorbar.set_label('SHAP Difference (Seq2 - Seq1)', fontsize=9, labelpad=5)

    ax.set_yticks([])
    ax.set_xlabel('Relative Position in Sequence', fontsize=10)
    ax.set_title(title, pad=10)
    plt.subplots_adjust(bottom=0.25, left=0.05, right=0.95)
    return fig
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
     """
-    Compare two sequences using relative positions (0-1 scale)
     """
-    # Analyze first sequence
     res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
     if isinstance(res1[0], str) and "Error" in res1[0]:
         return (f"Error in sequence 1: {res1[0]}", None, None)
-    # Analyze second sequence
     res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
     if isinstance(res2[0], str) and "Error" in res2[0]:
         return (f"Error in sequence 2: {res2[0]}", None, None)
@@ -387,53 +429,67 @@ def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
     shap1 = res1[3]["shap_means"]
     shap2 = res2[3]["shap_means"]
-    # Normalize to relative positions
-    shap1_norm, shap2_norm = normalize_shap_lengths(shap1, shap2)
-    shap_diff = compute_shap_difference(shap1_norm, shap2_norm)
     # Calculate statistics
     avg_diff = np.mean(shap_diff)
     std_diff = np.std(shap_diff)
     max_diff = np.max(shap_diff)
     min_diff = np.min(shap_diff)
-    threshold = 0.05
-    substantial_diffs = np.abs(shap_diff) > threshold
     frac_different = np.mean(substantial_diffs)
-    # Format output text
-    len1_formatted = "{:,}".format(len(shap1))
-    len2_formatted = "{:,}".format(len(shap2))
-    classification1 = res1[0].split('Classification: ')[1].split('\n')[0].strip()
-    classification2 = res2[0].split('Classification: ')[1].split('\n')[0].strip()
     comparison_text = (
         "Sequence Comparison Results:\n"
-        f"Sequence 1: {res1[4]} (Length: {len1_formatted} bases)\n"
-        f"Classification: {classification1}\n\n"
-        f"Sequence 2: {res2[4]} (Length: {len2_formatted} bases)\n"
-        f"Classification: {classification2}\n\n"
-        "Comparison Statistics:\n"
         f"Average SHAP difference: {avg_diff:.4f}\n"
         f"Standard deviation: {std_diff:.4f}\n"
         f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
         f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
-        f"Fraction of positions with substantial differences: {frac_different:.2%}\n\n"
-        "Note: Comparisons shown at relative positions (0-100%) in each sequence\n"
         "Interpretation:\n"
-        "- Red regions: Sequence 2 is more human-like at that relative position\n"
-        "- Blue regions: Sequence 1 is more human-like at that relative position\n"
         "- White regions: Similar between sequences"
     )
     # Generate visualizations
-    heatmap_fig = plot_comparative_heatmap(shap_diff)
     heatmap_img = fig_to_image(heatmap_fig)
-    hist_fig = plot_shap_histogram(shap_diff, title="Distribution of SHAP Differences")
     hist_img = fig_to_image(hist_fig)
     return comparison_text, heatmap_img, hist_img
 ###############################################################################
 # 10. BUILD GRADIO INTERFACE
 ###############################################################################

 # 9. COMPARISON ANALYSIS FUNCTIONS
 ###############################################################################
def calculate_adaptive_parameters(len1, len2):
    """
    Calculate adaptive parameters based on sequence lengths and their difference.

    The absolute length difference selects one of four tiers. Closer
    lengths get more interpolation points and a smaller smoothing window;
    very different lengths get fewer points and a larger window. The window
    is finally widened by up to 2x as the shorter/longer length ratio
    drops towards 0.

    Args:
        len1: Length of the first sequence (non-negative int).
        len2: Length of the second sequence (non-negative int).

    Returns:
        tuple: (num_points, smooth_window, resolution_factor) where the
        first two are ints and the last is a float.
    """
    length_diff = abs(len1 - len2)
    max_length = max(len1, len2)
    # Two empty sequences are identical in length; avoid dividing by zero.
    length_ratio = min(len1, len2) / max_length if max_length else 1.0

    # Base number of points scales with sequence length, clamped to [500, 2000]
    base_points = min(2000, max(500, max_length // 100))

    # Adjust resolution based on length difference
    if length_diff < 500:
        resolution_factor = 2.0  # Higher resolution for very similar sequences
        num_points = min(3000, base_points * 2)
        smooth_window = max(10, length_diff // 50)  # Minimal smoothing
    elif length_diff < 5000:
        resolution_factor = 1.5
        num_points = min(2000, base_points * 1.5)  # float here; cast on return
        smooth_window = max(20, length_diff // 100)
    elif length_diff < 50000:
        resolution_factor = 1.0
        num_points = base_points
        smooth_window = max(50, length_diff // 200)
    else:
        # For very large differences, reduce resolution but increase smoothing
        resolution_factor = 0.75
        num_points = max(500, base_points // 2)
        smooth_window = max(100, length_diff // 500)

    # Widen the window as the lengths diverge (factor between 1 and 2)
    smooth_window = int(smooth_window * (1 + (1 - length_ratio)))
    return int(num_points), int(smooth_window), resolution_factor
def sliding_window_smooth(values, window_size=50):
    """
    Apply sliding window smoothing with edge handling.

    The kernel is flat-topped with exponentially decaying tails: weights
    are largest at the centre and fall to e^-3 of that at the outermost
    taps. Positions too close to either end for a full window keep their
    original values instead of being padded.

    Args:
        values: 1-D sequence or array of numbers.
        window_size: Kernel length in samples. It is clamped to
            ``len(values)``; an effective size below 3 disables smoothing.

    Returns:
        ``values`` itself (unchanged) when the effective window is smaller
        than 3; otherwise a new float array of the same length.
    """
    data = np.asarray(values, dtype=float)
    # A kernel longer than the data would make 'valid' convolution return
    # the wrong length, so never let the window exceed the input.
    window_size = min(int(window_size), data.size)
    if window_size < 3:
        return values

    half = window_size // 2
    taper = np.exp(-np.linspace(0, 3, half))  # 1 -> e^-3

    window = np.ones(window_size)
    window[:half] = taper[::-1]  # rises towards the centre
    window[-half:] = taper       # falls away from the centre
    window /= window.sum()       # unit gain: constants stay constant

    smoothed = np.convolve(data, window, mode='valid')

    # 'valid' drops window_size - 1 samples in total; split them between ends
    pad_size = data.size - smoothed.size
    pad_left = pad_size // 2
    pad_right = pad_size - pad_left

    # Start from the original values so the un-smoothable edges are preserved
    result = data.copy()
    result[pad_left:data.size - pad_right] = smoothed
    return result
def normalize_shap_lengths(shap1, shap2, num_points=None, smooth_window=None):
    """
    Normalize and smooth SHAP values with dynamic adaptation.

    Both series are smoothed with ``sliding_window_smooth`` and then
    linearly interpolated onto a shared grid of relative positions in
    [0, 1], so sequences of different lengths can be compared point by
    point.

    Args:
        shap1: First series of SHAP values.
        shap2: Second series of SHAP values.
        num_points: Size of the shared grid. When None (default) it is
            chosen by ``calculate_adaptive_parameters``.
        smooth_window: Smoothing window in samples. When None (default)
            it is chosen by ``calculate_adaptive_parameters``.

    Returns:
        tuple: (shap1_interp, shap2_interp, smooth_window) - the two
        resampled series and the smoothing window that was actually used.
    """
    # Only fall back to adaptive values for parameters the caller left unset,
    # so explicitly supplied settings are honoured rather than overwritten.
    if num_points is None or smooth_window is None:
        auto_points, auto_window, _ = calculate_adaptive_parameters(len(shap1), len(shap2))
        if num_points is None:
            num_points = auto_points
        if smooth_window is None:
            smooth_window = auto_window

    # Apply initial smoothing
    shap1_smooth = sliding_window_smooth(shap1, smooth_window)
    shap2_smooth = sliding_window_smooth(shap2, smooth_window)

    # Create relative positions
    x1 = np.linspace(0, 1, len(shap1_smooth))
    x2 = np.linspace(0, 1, len(shap2_smooth))
    x_norm = np.linspace(0, 1, num_points)

    # Interpolate smoothed values
    shap1_interp = np.interp(x_norm, x1, shap1_smooth)
    shap2_interp = np.interp(x_norm, x2, shap2_smooth)
    return shap1_interp, shap2_interp, smooth_window
 def analyze_sequence_comparison(file1, file2, fasta1="", fasta2=""):
     """
+    Fully dynamic sequence comparison with adaptive parameters.
     """
+    # Analyze sequences
     res1 = analyze_sequence(file1, top_kmers=10, fasta_text=fasta1, window_size=500)
     if isinstance(res1[0], str) and "Error" in res1[0]:
         return (f"Error in sequence 1: {res1[0]}", None, None)
     res2 = analyze_sequence(file2, top_kmers=10, fasta_text=fasta2, window_size=500)
     if isinstance(res2[0], str) and "Error" in res2[0]:
         return (f"Error in sequence 2: {res2[0]}", None, None)
     shap1 = res1[3]["shap_means"]
     shap2 = res2[3]["shap_means"]
+    # Get sequence properties
+    len1, len2 = len(shap1), len(shap2)
+    length_diff = abs(len1 - len2)
+    length_ratio = min(len1, len2) / max(len1, len2)
+    # Get normalized values with adaptive parameters
+    shap1_norm, shap2_norm, smooth_window = normalize_shap_lengths(shap1, shap2)
+    shap_diff = shap2_norm - shap1_norm
+    # Calculate adaptive threshold
+    base_threshold = 0.05
+    adaptive_threshold = base_threshold * (1 + (1 - length_ratio))
+    if length_diff > 50000:
+        adaptive_threshold *= 1.5  # More forgiving for very large differences
     # Calculate statistics
     avg_diff = np.mean(shap_diff)
     std_diff = np.std(shap_diff)
     max_diff = np.max(shap_diff)
     min_diff = np.min(shap_diff)
+    substantial_diffs = np.abs(shap_diff) > adaptive_threshold
     frac_different = np.mean(substantial_diffs)
+    # Format detailed output
     comparison_text = (
         "Sequence Comparison Results:\n"
+        f"Sequence 1: {res1[4]} (Length: {len1:,} bases)\n"
+        f"Classification: {res1[0].split('Classification: ')[1].split('\n')[0].strip()}\n\n"
+        f"Sequence 2: {res2[4]} (Length: {len2:,} bases)\n"
+        f"Classification: {res2[0].split('Classification: ')[1].split('\n')[0].strip()}\n\n"
+        f"Comparison Parameters:\n"
+        f"Length Difference: {length_diff:,} bases\n"
+        f"Length Ratio: {length_ratio:.3f}\n"
+        f"Smoothing Window: {smooth_window} points\n"
+        f"Adaptive Threshold: {adaptive_threshold:.3f}\n\n"
+        "Statistics:\n"
         f"Average SHAP difference: {avg_diff:.4f}\n"
         f"Standard deviation: {std_diff:.4f}\n"
         f"Max difference: {max_diff:.4f} (Seq2 more human-like)\n"
         f"Min difference: {min_diff:.4f} (Seq1 more human-like)\n"
+        f"Fraction with substantial differences: {frac_different:.2%}\n\n"
+        "Note: All parameters automatically adjusted based on sequence properties\n"
         "Interpretation:\n"
+        "- Red regions: Sequence 2 more human-like\n"
+        "- Blue regions: Sequence 1 more human-like\n"
         "- White regions: Similar between sequences"
     )
     # Generate visualizations
+    heatmap_fig = plot_comparative_heatmap(
+        shap_diff,
+        title=f"SHAP Difference Heatmap (window: {smooth_window})"
+    )
     heatmap_img = fig_to_image(heatmap_fig)
+    # Adaptive number of bins based on data
+    num_bins = max(20, min(50, int(np.sqrt(len(shap_diff)))))
+    hist_fig = plot_shap_histogram(shap_diff, num_bins=num_bins)
     hist_img = fig_to_image(hist_fig)
     return comparison_text, heatmap_img, hist_img
 ###############################################################################
 # 10. BUILD GRADIO INTERFACE
 ###############################################################################