Spaces:

genomenet
/

crispr-array-detection

Sleeping

genomenet Claude Opus 4.5 committed on Apr 24

Commit

3cc5297

1 Parent(s): 6b4e599

Minimalist monochrome redesign with Geist Mono font

- Monochrome/grayscale color scheme throughout
- Geist Mono font for code and sequence display
- Simplified UI text: lowercase labels, minimal descriptions
- Grayscale Plotly charts with subtle styling
- Minimal header: "crispr-detect" with brief description
- Compact API and About tabs
- Zinc-based Gradio theme

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (3) hide show

app.py +420 -292
inference/inference.py +35 -7
inference/tokenizer.py +31 -5

app.py CHANGED Viewed

@@ -3,7 +3,10 @@ CRISPR Array Detection - HuggingFace Spaces App
 """
 import os
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 import gradio as gr
 import numpy as np
@@ -23,24 +26,100 @@ from inference.model_loader import get_model, warmup_model, get_gpu_status
 from inference.tokenizer import validate_sequence, strip_fasta_header
 from inference.inference import detect_crispr_regions
-# Custom CSS for better fonts
 CUSTOM_CSS = """
-@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&family=JetBrains+Mono&display=swap');
 * {
-    font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif !important;
 }
-code, pre, .code, textarea {
-    font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
 }
-h1, h2, h3 {
-    font-weight: 600 !important;
 }
 .gradio-container {
     max-width: 1200px !important;
 }
 """
@@ -68,6 +147,100 @@ EMBEDDING_CRISPR_EXAMPLE = """GACAGGTACAAGAAGGAGTATGCATCAATGTGGTCGTGTGGAACAAACGC
 EMBEDDING_RANDOM_EXAMPLE = """ATGCGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTA
GCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCT"""
 def create_prediction_plot(positions, probabilities, threshold=0.3, regions=None):
     """Create a matplotlib figure showing the prediction curve (for PNG/PDF export)."""
     fig, ax = plt.subplots(figsize=(12, 4))
@@ -89,7 +262,7 @@ def create_prediction_plot(positions, probabilities, threshold=0.3, regions=None
     ax.set_ylabel('CRISPR Probability')
     ax.set_title('CRISPR Array Detection Score')
     ax.set_ylim(0, 1)
-    ax.set_xlim(0, max(positions) if positions else 1000)
     ax.legend(loc='upper right')
     ax.grid(True, alpha=0.3)
@@ -101,102 +274,96 @@ def create_interactive_prediction_plot(positions, probabilities, threshold=0.3,
     """Create an interactive Plotly figure showing the prediction curve with minimap."""
     fig = go.Figure()
     max_pos = max(positions) if positions else 1000
-    # Main probability curve with fill
     fig.add_trace(go.Scatter(
         x=positions,
         y=probabilities,
         mode='lines',
-        name='Prediction Score',
-        line=dict(color='#2563eb', width=1.5),
         fill='tozeroy',
-        fillcolor='rgba(37, 99, 235, 0.15)',
         hovertemplate='Position: %{x:,} bp<br>Score: %{y:.3f}<extra></extra>'
     ))
-    # Add threshold line
     fig.add_hline(
         y=threshold,
         line_dash="dash",
-        line_color="#dc2626",
-        annotation_text=f"Threshold ({threshold})",
         annotation_position="top right",
-        annotation_font_size=11
     )
-    # Highlight detected CRISPR regions
     if regions:
         for r in regions:
             fig.add_vrect(
                 x0=r['start'], x1=r['end'],
-                fillcolor="rgba(220, 38, 38, 0.12)",
                 layer="below",
                 line_width=1,
-                line_color="rgba(220, 38, 38, 0.3)",
-                annotation_text=f"CRISPR {r['region_id']}",
                 annotation_position="top left",
-                annotation_font_size=10,
-                annotation_font_color="#dc2626"
             )
     fig.update_layout(
-        title=dict(
-            text='CRISPR Array Detection',
-            font=dict(size=14, color='#1f2937'),
-            x=0.5,
-            xanchor='center'
-        ),
         xaxis=dict(
-            title='Position (bp)',
-            range=[0, max_pos],
-            gridcolor='#e5e7eb',
             showgrid=True,
             zeroline=False,
-            # Rangeslider for minimap navigation
             rangeslider=dict(
                 visible=True,
-                thickness=0.08,
-                bgcolor='#f3f4f6',
-                bordercolor='#d1d5db',
                 borderwidth=1
             ),
-            # Range selector buttons for quick zoom
             rangeselector=dict(
                 buttons=list([
                     dict(count=500, label="500bp", step="all", stepmode="backward"),
                     dict(count=1000, label="1kb", step="all", stepmode="backward"),
                     dict(count=5000, label="5kb", step="all", stepmode="backward"),
-                    dict(step="all", label="Full")
                 ]),
-                bgcolor='#f9fafb',
-                bordercolor='#d1d5db',
-                font=dict(size=10),
                 x=0,
-                y=1.15
             )
         ),
         yaxis=dict(
-            title='CRISPR Probability',
             range=[0, 1.05],
-            gridcolor='#e5e7eb',
             showgrid=True,
             zeroline=False,
             tickformat='.1f'
         ),
         hovermode='x unified',
-        showlegend=True,
-        legend=dict(
-            yanchor="top", y=0.99,
-            xanchor="right", x=0.99,
-            bgcolor='rgba(255,255,255,0.8)',
-            bordercolor='#e5e7eb',
-            borderwidth=1
-        ),
-        height=480,
-        plot_bgcolor='white',
-        paper_bgcolor='white',
-        margin=dict(t=80, b=60)
     )
     return fig
@@ -221,9 +388,8 @@ def create_embedding_heatmap(embedding, title="Sequence Embedding", cols=30):
     # Create figure
     fig, ax = plt.subplots(figsize=(14, max(3, rows * 0.25)))
-    # Use diverging colormap centered at 0
-    vmax = max(abs(np.nanmin(embedding)), abs(np.nanmax(embedding)))
-    norm = TwoSlopeNorm(vmin=-vmax, vcenter=0, vmax=vmax)
     im = ax.imshow(grid, cmap='RdBu_r', norm=norm, aspect='auto')
@@ -262,9 +428,8 @@ def create_trajectory_heatmap(embeddings, title="Embedding Trajectory"):
     fig, ax = plt.subplots(figsize=(14, max(4, n_windows * 0.3)))
-    # Use diverging colormap
-    vmax = max(abs(embeddings.min()), abs(embeddings.max()))
-    norm = TwoSlopeNorm(vmin=-vmax, vcenter=0, vmax=vmax)
     im = ax.imshow(embeddings, cmap='RdBu_r', norm=norm, aspect='auto')
@@ -442,17 +607,17 @@ def create_sequence_cluster_map(cluster_labels, stride=100, window_size=1000):
 def create_interactive_state_plot(embeddings, n_clusters=8, stride=100, use_3d=False):
     """
-    Create interactive Plotly State-Dynamic Plot with 2D or 3D UMAP.
     """
     embeddings = np.array(embeddings)
     n_windows, n_dims = embeddings.shape
     if n_windows < 5:
-        # Not enough data
         fig = go.Figure()
         fig.add_annotation(text="Need longer sequence (minimum ~1500 bp)",
                           xref="paper", yref="paper", x=0.5, y=0.5,
-                          showarrow=False, font=dict(size=16))
         return fig
     # UMAP reduction
@@ -477,70 +642,74 @@ def create_interactive_state_plot(embeddings, n_clusters=8, stride=100, use_3d=F
     hover_text = [f"Window {i}<br>Position: {pos}-{pos+1000} bp<br>Cluster: {c}"
                   for i, (pos, c) in enumerate(zip(positions, cluster_labels))]
-    # Color palette
-    colors = px.colors.qualitative.Set1[:n_clusters]
-    color_map = [colors[c] for c in cluster_labels]
     if use_3d:
-        # 3D Plot
         fig = go.Figure()
-        # Add trajectory line
         fig.add_trace(go.Scatter3d(
             x=embedding_reduced[:, 0],
             y=embedding_reduced[:, 1],
             z=embedding_reduced[:, 2],
             mode='lines',
-            line=dict(color='rgba(100,100,100,0.3)', width=2),
             name='Trajectory',
             hoverinfo='skip'
         ))
-        # Add points colored by cluster
         fig.add_trace(go.Scatter3d(
             x=embedding_reduced[:, 0],
             y=embedding_reduced[:, 1],
             z=embedding_reduced[:, 2],
             mode='markers',
             marker=dict(
-                size=6,
                 color=cluster_labels,
-                colorscale='Set1',
-                opacity=0.8,
-                line=dict(width=1, color='white')
             ),
             text=hover_text,
             hovertemplate='%{text}<extra></extra>',
             name='Windows'
         ))
-        # Mark start and end
         fig.add_trace(go.Scatter3d(
             x=[embedding_reduced[0, 0]],
             y=[embedding_reduced[0, 1]],
             z=[embedding_reduced[0, 2]],
             mode='markers',
-            marker=dict(size=12, color='green', symbol='diamond'),
-            name="Start (5')"
         ))
         fig.add_trace(go.Scatter3d(
             x=[embedding_reduced[-1, 0]],
             y=[embedding_reduced[-1, 1]],
             z=[embedding_reduced[-1, 2]],
             mode='markers',
-            marker=dict(size=12, color='red', symbol='square'),
-            name="End (3')"
         ))
         fig.update_layout(
-            title=dict(text='3D State-Dynamic Plot (drag to rotate)', font=dict(size=16)),
             scene=dict(
-                xaxis_title='UMAP 1',
-                yaxis_title='UMAP 2',
-                zaxis_title='UMAP 3'
             ),
-            height=600,
-            showlegend=True
         )
     else:
@@ -549,7 +718,7 @@ def create_interactive_state_plot(embeddings, n_clusters=8, stride=100, use_3d=F
             rows=2, cols=2,
             specs=[[{"type": "scatter"}, {"type": "scatter"}],
                    [{"type": "scatter", "colspan": 2}, None]],
-            subplot_titles=('By Cluster', 'By Position', 'Sequence Map'),
             row_heights=[0.6, 0.4],
             vertical_spacing=0.12
         )
@@ -559,7 +728,7 @@ def create_interactive_state_plot(embeddings, n_clusters=8, stride=100, use_3d=F
             x=embedding_reduced[:, 0],
             y=embedding_reduced[:, 1],
             mode='lines',
-            line=dict(color='rgba(100,100,100,0.2)', width=1),
             hoverinfo='skip',
             showlegend=False
         ), row=1, col=1)
@@ -570,36 +739,37 @@ def create_interactive_state_plot(embeddings, n_clusters=8, stride=100, use_3d=F
                 x=embedding_reduced[mask, 0],
                 y=embedding_reduced[mask, 1],
                 mode='markers',
-                marker=dict(size=8, color=colors[c], opacity=0.8,
-                           line=dict(width=1, color='white')),
                 text=[hover_text[i] for i in np.where(mask)[0]],
                 hovertemplate='%{text}<extra></extra>',
-                name=f'Cluster {c}',
                 legendgroup=f'c{c}'
             ), row=1, col=1)
         # Start/End markers
         fig.add_trace(go.Scatter(
             x=[embedding_reduced[0, 0]], y=[embedding_reduced[0, 1]],
-            mode='markers', marker=dict(size=15, color='green', symbol='triangle-up',
-                                        line=dict(width=2, color='black')),
-            name="Start (5')", showlegend=True
         ), row=1, col=1)
         fig.add_trace(go.Scatter(
             x=[embedding_reduced[-1, 0]], y=[embedding_reduced[-1, 1]],
-            mode='markers', marker=dict(size=15, color='red', symbol='square',
-                                        line=dict(width=2, color='black')),
-            name="End (3')", showlegend=True
         ), row=1, col=1)
-        # Right plot: by position
         fig.add_trace(go.Scatter(
             x=embedding_reduced[:, 0],
             y=embedding_reduced[:, 1],
             mode='lines+markers',
-            line=dict(color='rgba(100,100,100,0.3)', width=1),
-            marker=dict(size=8, color=np.arange(n_windows), colorscale='Viridis',
-                       showscale=True, colorbar=dict(title='Window', x=1.02)),
             text=hover_text,
             hovertemplate='%{text}<extra></extra>',
             showlegend=False
@@ -607,47 +777,59 @@ def create_interactive_state_plot(embeddings, n_clusters=8, stride=100, use_3d=F
         fig.add_trace(go.Scatter(
             x=[embedding_reduced[0, 0]], y=[embedding_reduced[0, 1]],
-            mode='markers', marker=dict(size=15, color='green', symbol='triangle-up',
-                                        line=dict(width=2, color='black')),
             showlegend=False
         ), row=1, col=2)
         fig.add_trace(go.Scatter(
             x=[embedding_reduced[-1, 0]], y=[embedding_reduced[-1, 1]],
-            mode='markers', marker=dict(size=15, color='red', symbol='square',
-                                        line=dict(width=2, color='black')),
             showlegend=False
         ), row=1, col=2)
-        # Bottom: sequence map (horizontal bar)
         window_size = 1000
         for i, (cluster, pos) in enumerate(zip(cluster_labels, positions)):
             fig.add_trace(go.Scatter(
                 x=[pos, pos + window_size, pos + window_size, pos, pos],
                 y=[0, 0, 1, 1, 0],
                 fill='toself',
-                fillcolor=colors[cluster],
                 line=dict(width=0),
-                opacity=0.7,
                 hoverinfo='text',
                 text=f'Position {pos}-{pos+window_size} bp<br>Cluster {cluster}',
                 showlegend=False
             ), row=2, col=1)
-        fig.update_xaxes(title_text='UMAP 1', row=1, col=1)
-        fig.update_yaxes(title_text='UMAP 2', row=1, col=1)
-        fig.update_xaxes(title_text='UMAP 1', row=1, col=2)
-        fig.update_yaxes(title_text='UMAP 2', row=1, col=2)
-        fig.update_xaxes(title_text='Position (bp)', row=2, col=1)
         fig.update_yaxes(visible=False, row=2, col=1)
         fig.update_layout(
-            title=dict(text='Interactive State-Dynamic Plot (hover for details, zoom/pan available)',
-                      font=dict(size=14)),
-            height=700,
             showlegend=True,
-            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
         )
     return fig
@@ -655,15 +837,23 @@ def parse_fasta_file(file_path):
     """Parse a FASTA file and return the sequence."""
     if file_path is None:
         return None
-    with open(file_path, 'r') as f:
         content = f.read()
-    return strip_fasta_header(content.strip())
-def create_gff3_export(regions, sequence_length, sequence_id="input_sequence"):
     """Create GFF3 format annotation file for detected CRISPR regions."""
-    import tempfile
-    gff_path = os.path.join(tempfile.gettempdir(), "crispr_regions.gff3")
     with open(gff_path, 'w') as f:
         # GFF3 header
@@ -673,59 +863,52 @@ def create_gff3_export(regions, sequence_length, sequence_id="input_sequence"):
         for r in regions:
             # GFF3 format: seqid source type start end score strand phase attributes
             attributes = f"ID=CRISPR_{r['region_id']};Name=CRISPR_array_{r['region_id']};score={r['mean_score']:.3f}"
-            f.write(f"{sequence_id}\tCRISPR-BERT\tCRISPR_array\t{r['start']+1}\t{r['end']}\t{r['mean_score']:.3f}\t.\t.\t{attributes}\n")
     return gff_path
 def create_sequence_viewer_html(sequence, positions, probabilities, threshold=0.3, chunk_size=100):
-    """Create an HTML visualization of the sequence with color-coded scores."""
-    # Interpolate scores to per-nucleotide level
-    import numpy as np
     seq_len = len(sequence)
-    per_base_scores = np.zeros(seq_len)
-    # Map window scores to positions
-    for i, (pos, prob) in enumerate(zip(positions, probabilities)):
-        start = pos
-        end = min(pos + 1000, seq_len)  # window size
-        # Average with existing scores for overlapping windows
-        for j in range(start, end):
-            if per_base_scores[j] == 0:
-                per_base_scores[j] = prob
-            else:
-                per_base_scores[j] = (per_base_scores[j] + prob) / 2
-    # Generate HTML
-    html_parts = ['<div style="font-family: monospace; font-size: 12px; line-height: 1.8; background: #f8f9fa; padding: 15px; border-radius: 8px; max-height: 400px; overflow-y: auto;">']
-    html_parts.append('<div style="margin-bottom: 10px; font-family: sans-serif; font-size: 13px;">')
-    html_parts.append('<span style="background: linear-gradient(to right, #3b82f6, #fbbf24, #ef4444); padding: 2px 20px; border-radius: 3px; color: white;">Low → Medium → High CRISPR Score</span>')
-    html_parts.append(f'<span style="margin-left: 15px;">Threshold: {threshold}</span>')
     html_parts.append('</div>')
-    # Process sequence in chunks with position markers
     for chunk_start in range(0, seq_len, chunk_size):
         chunk_end = min(chunk_start + chunk_size, seq_len)
         chunk_seq = sequence[chunk_start:chunk_end]
         chunk_scores = per_base_scores[chunk_start:chunk_end]
         # Position marker
-        html_parts.append(f'<div><span style="color: #666; width: 60px; display: inline-block; font-size: 11px;">{chunk_start+1:,}</span>')
         for i, (base, score) in enumerate(zip(chunk_seq, chunk_scores)):
-            # Color based on score: blue (low) -> yellow (medium) -> red (high)
-            if score < threshold * 0.5:
-                color = "#3b82f6"  # blue
-            elif score < threshold:
-                color = "#fbbf24"  # yellow
-            elif score < threshold * 1.5:
-                color = "#f97316"  # orange
-            else:
-                color = "#ef4444"  # red
-            bg_opacity = min(0.3 + score * 0.7, 1.0)
-            html_parts.append(f'<span style="color: {color}; background-color: rgba(0,0,0,{bg_opacity * 0.1}); font-weight: {"bold" if score >= threshold else "normal"};" title="Pos {chunk_start + i + 1}: {score:.3f}">{base}</span>')
         html_parts.append('</div>')
@@ -957,75 +1140,72 @@ Blue = negative activation, Red = positive activation.
 # Build interface
 with gr.Blocks(title="CRISPR Array Detection") as demo:
     gr.Markdown("""
-# CRISPR Array Detection
-A deep learning approach for identifying CRISPR arrays in prokaryotic genome sequences. This tool employs a 24-layer BERT transformer architecture (~430M parameters) that was pre-trained on metagenomic contigs and complete microbial genomes, then fine-tuned on annotated CRISPR array sequences.
-**Method**: Input sequences are processed using a sliding window approach (1000 bp window, configurable stride). For each window, the model outputs a probability score ∈ [0,1] indicating the likelihood that the central region contains part of a CRISPR array. Overlapping predictions are aggregated to produce per-position scores across the full sequence length.
-**Output**: Detected CRISPR regions are reported with genomic coordinates, mean prediction scores, and can be exported in standard formats (GFF3, CSV) for downstream analysis.
     """)
     with gr.Tab("Prediction"):
         with gr.Row():
             with gr.Column(scale=1):
                 seq_input = gr.Textbox(
-                    label="Input Sequence",
                     placeholder="Paste DNA sequence (FASTA format accepted)...",
                     lines=6,
                     value=FLANKED_CRISPR_EXAMPLE,
-                    info="Minimum length: 1000 bp. Accepts raw sequence or FASTA format."
                 )
                 file_upload = gr.File(
-                    label="Upload FASTA File",
                     file_types=[".fasta", ".fa", ".fna", ".txt"],
                     type="filepath"
                 )
                 with gr.Row():
                     stride_input = gr.Slider(
                         minimum=50, maximum=500, value=100, step=50,
-                        label="Stride (bp)",
-                        info="Step size between consecutive windows. Lower values increase resolution but require more computation."
                     )
                     threshold_input = gr.Slider(
                         minimum=0.1, maximum=0.9, value=0.3, step=0.05,
-                        label="Detection Threshold",
-                        info="Minimum score to classify a region as CRISPR. Lower = more sensitive, higher = more specific."
                     )
                 with gr.Row():
-                    predict_btn = gr.Button("Run Analysis", variant="primary", size="lg")
-                gr.Markdown("**Example sequences:**")
                 with gr.Row():
-                    gr.Button("Flanked CRISPR").click(
                         lambda: FLANKED_CRISPR_EXAMPLE, outputs=seq_input
                     )
-                    gr.Button("E. coli K-12 CRISPR I-E").click(
                         lambda: ECOLI_CRISPR_EXAMPLE, outputs=seq_input
                     )
                 with gr.Row():
-                    gr.Button("CRISPR Array").click(
                         lambda: CRISPR_EXAMPLE, outputs=seq_input
                     )
-                    gr.Button("Negative Control").click(
                         lambda: NON_CRISPR_EXAMPLE, outputs=seq_input
                     )
                 result_summary = gr.Markdown()
-                with gr.Accordion("Export Results", open=False, visible=False) as download_accordion:
-                    gr.Markdown("**Figures:**")
                     with gr.Row():
-                        pred_download_png = gr.File(label="PNG", interactive=False)
-                        pred_download_pdf = gr.File(label="PDF", interactive=False)
-                    gr.Markdown("**Data:**")
                     with gr.Row():
-                        pred_download_csv = gr.File(label="CSV", interactive=False)
-                        pred_download_gff = gr.File(label="GFF3", interactive=False)
                     with gr.Row():
-                        pred_download_summary = gr.File(label="Summary", interactive=False)
             with gr.Column(scale=2):
-                plot_output = gr.Plot(label="Prediction Score Profile")
-                with gr.Accordion("Sequence Viewer", open=False, visible=False) as seq_viewer_accordion:
-                    gr.Markdown("*Color scale: blue (low score) → yellow (medium) → red (high score). Hover over nucleotides for exact values.*")
-                    seq_viewer_html = gr.HTML(label="Color-coded sequence")
                 regions_output = gr.JSON(label="Detected Regions", visible=False)
         # Handle file upload - load content into textbox
@@ -1056,52 +1236,49 @@ A deep learning approach for identifying CRISPR arrays in prokaryotic genome seq
     with gr.Tab("Embeddings"):
         gr.Markdown("""
-### Hidden State Analysis
-Extract and visualize the model's internal representations (embeddings) from the transformer layers. The **State-Dynamics** mode applies UMAP dimensionality reduction to project the 768-dimensional embeddings into 2D/3D space, then performs agglomerative clustering to identify regions with similar activation patterns.
-**Biological interpretation**: In CRISPR arrays, repeat sequences share conserved motifs and should cluster together, while unique spacer sequences form distinct clusters. This creates a characteristic alternating pattern in the sequence map visualization.
 """)
         with gr.Row():
             with gr.Column(scale=1):
                 embed_seq = gr.Textbox(
-                    label="Input Sequence",
                     placeholder="Paste DNA sequence...",
                     lines=6,
                     value=EMBEDDING_CRISPR_EXAMPLE,
-                    info="Longer sequences (>2000 bp) provide better clustering resolution."
                 )
                 embed_mode = gr.Radio(
                     choices=["state-dynamics", "mean", "max", "trajectory"],
                     value="state-dynamics",
-                    label="Visualization Mode",
-                    info="state-dynamics: UMAP clustering | mean/max: pooled embedding | trajectory: per-window heatmap"
                 )
                 use_3d = gr.Checkbox(
-                    label="3D UMAP Projection",
                     value=False,
-                    info="Project embeddings to 3D space (interactive rotation)",
                     visible=True
                 )
                 with gr.Row():
-                    embed_btn = gr.Button("Extract Embeddings", variant="primary")
                 with gr.Row():
-                    gr.Button("CRISPR Example (3kb)").click(
                         lambda: EMBEDDING_CRISPR_EXAMPLE, outputs=embed_seq
                     )
-                    gr.Button("Control Sequence (3kb)").click(
                         lambda: EMBEDDING_RANDOM_EXAMPLE, outputs=embed_seq
                     )
-                gr.Markdown("""
-**Example structure:** 600 bp upstream | CRISPR array (25 repeats + 24 spacers) | 600 bp downstream
-""")
                 embed_summary = gr.Markdown()
-                with gr.Accordion("Export Results", open=False, visible=False) as embed_download_accordion:
                     with gr.Row():
-                        download_png = gr.File(label="PNG", interactive=False)
-                        download_pdf = gr.File(label="PDF", interactive=False)
             with gr.Column(scale=2):
-                embed_plot = gr.Plot(label="Embedding Visualization")
         # Show/hide 3D checkbox based on mode
         embed_mode.change(
@@ -1122,119 +1299,64 @@ Extract and visualize the model's internal representations (embeddings) from the
     with gr.Tab("API"):
         gr.Markdown("""
-### Programmatic Access
-This tool can be accessed programmatically using the Gradio Python client or via HTTP requests.
-#### Python Client
 ```python
 from gradio_client import Client
-# Connect to the API
 client = Client("genomenet/crispr-array-detection")
-# Run prediction
 result = client.predict(
-    sequence="ATGC...",      # DNA sequence (min 1000 bp)
-    stride=100,              # Window stride in bp
-    threshold=0.3,           # Detection threshold
     api_name="/predict"
 )
-# result contains: (plot, summary, regions, png_path, pdf_path, csv_path, summary_path, gff_path, seq_viewer_html)
-```
-#### Extract Embeddings
-```python
 result = client.predict(
     sequence="ATGC...",
-    mode="state-dynamics",   # or "mean", "max", "trajectory"
     use_3d=False,
     api_name="/get_embedding"
 )
 ```
-#### cURL Example
-```bash
-curl -X POST "https://genomenet-crispr-array-detection.hf.space/api/predict" \\
-  -H "Content-Type: application/json" \\
-  -d '{"data": ["ATGCATGC...", 100, 0.3]}'
-```
-#### Output Formats
-| Format | Description |
-|--------|-------------|
-| CSV | Per-position scores: `position, probability, above_threshold` |
-| GFF3 | Standard genome annotation format for detected regions |
-| TXT | Human-readable summary with statistics |
-| PNG/PDF | Publication-ready figures |
-#### Rate Limits
-- Free tier: Standard HuggingFace rate limits apply
-- For high-throughput analysis, consider running the model locally
-#### Local Installation
 ```bash
 git clone https://huggingface.co/spaces/genomenet/crispr-array-detection
-cd crispr-array-detection
-pip install -r requirements.txt
-python app.py
 ```
         """)
     with gr.Tab("About"):
         gr.Markdown("""
-### Model Architecture
-| Component | Specification |
-|-----------|--------------|
-| Base model | BERT (Bidirectional Encoder Representations from Transformers) |
-| Layers | 24 transformer blocks |
-| Hidden size | 768 dimensions |
-| Attention heads | 12 |
-| Parameters | ~430 million |
-| Classification head | Bottleneck architecture |
-### Training
-**Pre-training corpus**: Metagenomic contigs and complete microbial genomes from public databases.
-**Fine-tuning data**: Annotated CRISPR arrays from bacterial and archaeal genomes, including positive examples from CRISPRCasdb and negative examples from non-CRISPR genomic regions.
-**Embedding extraction**: Hidden states are extracted from transformer layer 21 (768 dimensions per position).
-### Parameters
-| Parameter | Range | Default | Description |
-|-----------|-------|---------|-------------|
-| Stride | 50-500 bp | 100 bp | Step size between windows. Lower = higher resolution, more computation |
-| Threshold | 0.1-0.9 | 0.3 | Detection cutoff. Lower = more sensitive, higher = more specific |
-| Window size | Fixed | 1000 bp | Input window for the transformer model |
-### Performance Considerations
-- **GPU recommended**: T4 or better for interactive use
-- **CPU inference**: Functional but slower (~10-30s per analysis)
-- **Memory**: ~2GB GPU memory required
-### Citation
-If you use this tool in your research, please cite:
-> Mu, Z. (2024). Deep Learning-Based CRISPR Array Detection. Master's Thesis, Helmholtz Centre for Infection Research.
-### Acknowledgements
-- Ziyu Mu - Model development (Master's Thesis, HZI BIFO)
-- DFG SPP 2141 "Much more than Defence" (Project MC 172)
-- BMBF de.NBI / GenomeNet
-- Helmholtz Centre for Infection Research (HZI)
         """)
@@ -1246,6 +1368,12 @@ if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        theme=gr.themes.Soft(),
         css=CUSTOM_CSS
     )

 """
 import os
+import html
+import tempfile
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+os.environ.setdefault("MPLCONFIGDIR", os.path.join(tempfile.gettempdir(), "matplotlib"))
 import gradio as gr
 import numpy as np
 from inference.tokenizer import validate_sequence, strip_fasta_header
 from inference.inference import detect_crispr_regions
+MAX_SEQUENCE_LENGTH = int(os.environ.get("MAX_SEQUENCE_LENGTH", "50000"))
+MAX_UPLOAD_BYTES = int(os.environ.get("MAX_UPLOAD_BYTES", str(2 * 1024 * 1024)))
+MAX_SEQUENCE_VIEWER_LENGTH = int(os.environ.get("MAX_SEQUENCE_VIEWER_LENGTH", "20000"))
+QUEUE_MAX_SIZE = int(os.environ.get("GRADIO_QUEUE_MAX_SIZE", "8"))
+# Custom CSS - minimal monochrome design: Inter for UI text, Geist Mono for code and sequences
 CUSTOM_CSS = """
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600&display=swap');
+@font-face {
+    font-family: 'Geist Mono';
+    src: url('https://cdn.jsdelivr.net/npm/geist@1.2.0/dist/fonts/geist-mono/GeistMono-Regular.woff2') format('woff2');
+    font-weight: 400;
+}
+@font-face {
+    font-family: 'Geist Mono';
+    src: url('https://cdn.jsdelivr.net/npm/geist@1.2.0/dist/fonts/geist-mono/GeistMono-Medium.woff2') format('woff2');
+    font-weight: 500;
+}
 * {
+    font-family: 'Inter', -apple-system, BlinkMacSystemFont, system-ui, sans-serif !important;
 }
+code, pre, .code, textarea, .prose code {
+    font-family: 'Geist Mono', 'SF Mono', Consolas, monospace !important;
 }
+h1 {
+    font-weight: 500 !important;
+    letter-spacing: -0.02em !important;
+}
+h2, h3, h4 {
+    font-weight: 500 !important;
+    color: #18181b !important;
 }
 .gradio-container {
     max-width: 1200px !important;
+    background: #fafafa !important;
+}
+.gr-button-primary {
+    background: #18181b !important;
+    border: none !important;
+}
+.gr-button-primary:hover {
+    background: #27272a !important;
+}
+.gr-button-secondary {
+    background: #fff !important;
+    border: 1px solid #e4e4e7 !important;
+    color: #18181b !important;
+}
+.gr-panel {
+    border: 1px solid #e4e4e7 !important;
+    background: #fff !important;
+}
+/* Minimal table styling */
+table {
+    border-collapse: collapse !important;
+}
+th, td {
+    border-bottom: 1px solid #e4e4e7 !important;
+    padding: 8px 12px !important;
+}
+th {
+    font-weight: 500 !important;
+    text-transform: uppercase !important;
+    font-size: 11px !important;
+    letter-spacing: 0.05em !important;
+    color: #71717a !important;
+}
+/* Slider styling */
+input[type="range"] {
+    accent-color: #18181b !important;
+}
+/* Tab styling */
+.tab-nav button {
+    font-weight: 400 !important;
+    color: #52525b !important;
+}
+.tab-nav button.selected {
+    color: #18181b !important;
+    border-bottom: 2px solid #18181b !important;
 }
 """
 EMBEDDING_RANDOM_EXAMPLE = """ATGCGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTA
GCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCT"""
+def _count_fasta_records(text: str) -> int:
+    return sum(1 for line in text.splitlines() if line.strip().startswith(">"))
def normalize_sequence_input(sequence: str) -> tuple[bool, str, str]:
    """Normalize raw or single-record FASTA text into a validated sequence.

    Returns a ``(ok, cleaned, error)`` triple:

    * ``ok`` is True only when every check passed.
    * ``cleaned`` is the output of ``strip_fasta_header`` (empty when the
      input was rejected before that step).
    * ``error`` is an empty string on success, otherwise a user-facing
      message.

    Checks run in this order: empty input, more than one FASTA header,
    ``validate_sequence``, then the ``MAX_SEQUENCE_LENGTH`` limit.
    """
    # None and whitespace-only input are both reported as "empty".
    raw_text = "" if sequence is None else str(sequence).strip()
    if not raw_text:
        return False, "", "Sequence is empty"

    if _count_fasta_records(raw_text) > 1:
        return False, "", "Multi-FASTA input is not supported. Please submit one sequence at a time."

    cleaned = strip_fasta_header(raw_text)
    is_valid, error = validate_sequence(cleaned)
    if not is_valid:
        return False, cleaned, error

    n_bases = len(cleaned)
    if n_bases > MAX_SEQUENCE_LENGTH:
        too_long = f"Sequence too long: {n_bases:,} bp > {MAX_SEQUENCE_LENGTH:,} bp limit"
        return False, cleaned, too_long

    return True, cleaned, ""
def validate_stride(stride) -> tuple[bool, int, str]:
    """Validate the sliding-window stride.

    Returns ``(ok, stride, error)``. ``stride`` is the parsed integer when
    parsing succeeded (even if out of range) and ``0`` when it did not;
    ``error`` is empty on success.

    Accepted inputs are integers, integral numbers of any numeric type
    (for example ``100.0``), and strings that ``int()`` can parse. Booleans
    are rejected. Any numeric value with a fractional part is rejected
    rather than truncated; this covers ``float`` as well as ``Decimal``,
    ``Fraction`` and numpy scalars.
    """
    not_an_integer = (False, 0, "Stride must be an integer between 50 and 500 bp")

    # bool is a subclass of int; treat it as a type error, not as 0/1.
    if isinstance(stride, bool):
        return not_an_integer

    try:
        value = int(stride)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers infinities (float, Decimal, numpy scalars).
        return not_an_integer

    # int() silently truncates non-integral numbers; comparing back against
    # the input catches that for every numeric type. Text inputs are exempt
    # because int() already parses them strictly.
    if not isinstance(stride, (str, bytes, bytearray)) and value != stride:
        return not_an_integer

    if not 50 <= value <= 500:
        return False, value, "Stride must be between 50 and 500 bp"
    return True, value, ""
def validate_threshold(threshold) -> tuple[bool, float, str]:
    """Validate the detection threshold.

    Returns ``(ok, threshold, error)``. ``threshold`` is the parsed float
    when parsing succeeded (even if out of range) and ``0.0`` when it did
    not; ``error`` is empty on success.

    Any value ``float()`` can convert is accepted if it lies in ``[0, 1]``.
    Booleans are rejected, matching ``validate_stride``. NaN fails the range
    check because every comparison with NaN is false.
    """
    not_a_number = (False, 0.0, "Threshold must be a number between 0 and 1")

    # bool is a subclass of int; reject it instead of reading it as 0.0/1.0.
    if isinstance(threshold, bool):
        return not_a_number

    try:
        value = float(threshold)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: integers too large to represent as a float.
        return not_a_number

    if not 0.0 <= value <= 1.0:
        return False, value, "Threshold must be between 0 and 1"
    return True, value, ""
def validate_min_length(min_length) -> tuple[bool, int, str]:
    """Validate the minimum region length in base pairs.

    Returns ``(ok, min_length, error)``. ``min_length`` is the parsed
    integer when parsing succeeded (even if below 1) and ``0`` when it did
    not; ``error`` is empty on success.

    Accepted inputs are integers, integral numbers of any numeric type, and
    strings that ``int()`` can parse. Booleans are rejected, matching
    ``validate_stride``. Numeric values with a fractional part are rejected
    rather than truncated.
    """
    not_an_integer = (False, 0, "Minimum region length must be an integer")

    # bool is a subclass of int; treat it as a type error, not as 0/1.
    if isinstance(min_length, bool):
        return not_an_integer

    try:
        value = int(min_length)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers infinities (float, Decimal, numpy scalars).
        return not_an_integer

    # Catch silent truncation by int() for any numeric type; text inputs
    # are already parsed strictly by int().
    if not isinstance(min_length, (str, bytes, bytearray)) and value != min_length:
        return not_an_integer

    if value < 1:
        return False, value, "Minimum region length must be at least 1 bp"
    return True, value, ""
def prediction_error_outputs(message: str):
    """Build the nine-element output tuple used to report a prediction error.

    The second element is *message* formatted as a Markdown error line, the
    third is an empty list, the last is an empty string, and every other
    slot is ``None``.
    """
    # NOTE(review): slot order presumably mirrors the prediction handler's
    # Gradio outputs; confirm against the event wiring.
    error_markdown = f"**Error**: {message}"
    return (None, error_markdown, [], None, None, None, None, None, "")
def embedding_error_outputs(message: str):
    """Build the four-element output tuple used to report an embedding error.

    The second element is *message* formatted as a Markdown error line;
    the other three slots are ``None``.
    """
    # NOTE(review): slot order presumably mirrors the embedding handler's
    # Gradio outputs; confirm against the event wiring.
    error_markdown = f"**Error**: {message}"
    return (None, error_markdown, None, None)
def make_output_dir(prefix: str) -> str:
    """Create a fresh temporary directory and return its path.

    The directory name starts with *prefix* followed by an underscore.
    The directory is not removed here.
    """
    dir_prefix = prefix + "_"
    return tempfile.mkdtemp(prefix=dir_prefix)
def symmetric_activation_norm(values) -> TwoSlopeNorm:
    """Return a ``TwoSlopeNorm`` centred on zero with symmetric limits.

    The limit is the largest absolute finite value in *values*. When there
    are no finite values, or they are all zero, the limit falls back to
    ``1.0`` so that ``vmin < vcenter < vmax`` always holds.
    """
    data = np.asarray(values, dtype=float)
    finite_values = data[np.isfinite(data)]

    span = 1.0  # fallback for empty or all-zero input
    if finite_values.size:
        lowest = abs(float(np.nanmin(finite_values)))
        highest = abs(float(np.nanmax(finite_values)))
        largest = max(lowest, highest)
        if largest > 0:
            span = largest

    return TwoSlopeNorm(vmin=-span, vcenter=0, vmax=span)
 def create_prediction_plot(positions, probabilities, threshold=0.3, regions=None):
     """Create a matplotlib figure showing the prediction curve (for PNG/PDF export)."""
     fig, ax = plt.subplots(figsize=(12, 4))
     ax.set_ylabel('CRISPR Probability')
     ax.set_title('CRISPR Array Detection Score')
     ax.set_ylim(0, 1)
+    ax.set_xlim(min(positions) if positions else 1, max(positions) if positions else 1000)
     ax.legend(loc='upper right')
     ax.grid(True, alpha=0.3)
     """Create an interactive Plotly figure showing the prediction curve with minimap."""
     fig = go.Figure()
+    min_pos = min(positions) if positions else 1
     max_pos = max(positions) if positions else 1000
+    # Main probability curve with fill - monochrome
     fig.add_trace(go.Scatter(
         x=positions,
         y=probabilities,
         mode='lines',
+        name='Score',
+        line=dict(color='#18181b', width=1.5),
         fill='tozeroy',
+        fillcolor='rgba(24, 24, 27, 0.08)',
         hovertemplate='Position: %{x:,} bp<br>Score: %{y:.3f}<extra></extra>'
     ))
+    # Add threshold line - dashed gray
     fig.add_hline(
         y=threshold,
         line_dash="dash",
+        line_color="#71717a",
+        annotation_text=f"threshold={threshold}",
         annotation_position="top right",
+        annotation_font_size=10,
+        annotation_font_color="#71717a"
     )
+    # Highlight detected CRISPR regions - subtle gray
     if regions:
         for r in regions:
             fig.add_vrect(
                 x0=r['start'], x1=r['end'],
+                fillcolor="rgba(24, 24, 27, 0.06)",
                 layer="below",
                 line_width=1,
+                line_color="rgba(24, 24, 27, 0.2)",
+                annotation_text=f"#{r['region_id']}",
                 annotation_position="top left",
+                annotation_font_size=9,
+                annotation_font_color="#52525b"
             )
     fig.update_layout(
+        title=None,
         xaxis=dict(
+            title=dict(text='Position (bp)', font=dict(size=11, color='#52525b')),
+            range=[min_pos, max_pos],
+            gridcolor='#f4f4f5',
             showgrid=True,
             zeroline=False,
+            linecolor='#e4e4e7',
+            tickfont=dict(size=10, color='#71717a'),
             rangeslider=dict(
                 visible=True,
+                thickness=0.06,
+                bgcolor='#fafafa',
+                bordercolor='#e4e4e7',
                 borderwidth=1
             ),
             rangeselector=dict(
                 buttons=list([
                     dict(count=500, label="500bp", step="all", stepmode="backward"),
                     dict(count=1000, label="1kb", step="all", stepmode="backward"),
                     dict(count=5000, label="5kb", step="all", stepmode="backward"),
+                    dict(step="all", label="all")
                 ]),
+                bgcolor='#fafafa',
+                bordercolor='#e4e4e7',
+                activecolor='#e4e4e7',
+                font=dict(size=9, color='#52525b'),
                 x=0,
+                y=1.12
             )
         ),
         yaxis=dict(
+            title=dict(text='Score', font=dict(size=11, color='#52525b')),
             range=[0, 1.05],
+            gridcolor='#f4f4f5',
             showgrid=True,
             zeroline=False,
+            linecolor='#e4e4e7',
+            tickfont=dict(size=10, color='#71717a'),
             tickformat='.1f'
         ),
         hovermode='x unified',
+        showlegend=False,
+        height=420,
+        plot_bgcolor='#fafafa',
+        paper_bgcolor='#fafafa',
+        margin=dict(t=50, b=60, l=50, r=20),
+        font=dict(family='Inter, system-ui, sans-serif')
     )
     return fig
     # Create figure
     fig, ax = plt.subplots(figsize=(14, max(3, rows * 0.25)))
+    # Use diverging colormap centered at 0; constant embeddings need a non-zero span.
+    norm = symmetric_activation_norm(embedding)
     im = ax.imshow(grid, cmap='RdBu_r', norm=norm, aspect='auto')
     fig, ax = plt.subplots(figsize=(14, max(4, n_windows * 0.3)))
+    # Use diverging colormap; constant embeddings need a non-zero span.
+    norm = symmetric_activation_norm(embeddings)
     im = ax.imshow(embeddings, cmap='RdBu_r', norm=norm, aspect='auto')
 def create_interactive_state_plot(embeddings, n_clusters=8, stride=100, use_3d=False):
     """
+    Create interactive Plotly State-Dynamic Plot with 2D or 3D UMAP - monochrome style.
     """
     embeddings = np.array(embeddings)
     n_windows, n_dims = embeddings.shape
     if n_windows < 5:
         fig = go.Figure()
         fig.add_annotation(text="Need longer sequence (minimum ~1500 bp)",
                           xref="paper", yref="paper", x=0.5, y=0.5,
+                          showarrow=False, font=dict(size=14, color='#71717a'))
+        fig.update_layout(plot_bgcolor='#fafafa', paper_bgcolor='#fafafa')
         return fig
     # UMAP reduction
     hover_text = [f"Window {i}<br>Position: {pos}-{pos+1000} bp<br>Cluster: {c}"
                   for i, (pos, c) in enumerate(zip(positions, cluster_labels))]
+    # Monochrome grayscale palette for clusters
+    grays = [f'rgba({int(40 + i * 180 / n_clusters)}, {int(40 + i * 180 / n_clusters)}, {int(40 + i * 180 / n_clusters)}, 0.8)'
+             for i in range(n_clusters)]
     if use_3d:
         fig = go.Figure()
+        # Trajectory line
         fig.add_trace(go.Scatter3d(
             x=embedding_reduced[:, 0],
             y=embedding_reduced[:, 1],
             z=embedding_reduced[:, 2],
             mode='lines',
+            line=dict(color='rgba(113,113,122,0.3)', width=2),
             name='Trajectory',
             hoverinfo='skip'
         ))
+        # Points - grayscale colorscale
         fig.add_trace(go.Scatter3d(
             x=embedding_reduced[:, 0],
             y=embedding_reduced[:, 1],
             z=embedding_reduced[:, 2],
             mode='markers',
             marker=dict(
+                size=5,
                 color=cluster_labels,
+                colorscale='Greys',
+                opacity=0.85,
+                line=dict(width=0.5, color='white')
             ),
             text=hover_text,
             hovertemplate='%{text}<extra></extra>',
             name='Windows'
         ))
+        # Start marker - dark
         fig.add_trace(go.Scatter3d(
             x=[embedding_reduced[0, 0]],
             y=[embedding_reduced[0, 1]],
             z=[embedding_reduced[0, 2]],
             mode='markers',
+            marker=dict(size=10, color='#18181b', symbol='diamond'),
+            name="5' start"
         ))
+        # End marker - medium gray
         fig.add_trace(go.Scatter3d(
             x=[embedding_reduced[-1, 0]],
             y=[embedding_reduced[-1, 1]],
             z=[embedding_reduced[-1, 2]],
             mode='markers',
+            marker=dict(size=10, color='#71717a', symbol='square'),
+            name="3' end"
         ))
         fig.update_layout(
+            title=None,
             scene=dict(
+                xaxis=dict(title='UMAP 1', gridcolor='#e4e4e7', backgroundcolor='#fafafa'),
+                yaxis=dict(title='UMAP 2', gridcolor='#e4e4e7', backgroundcolor='#fafafa'),
+                zaxis=dict(title='UMAP 3', gridcolor='#e4e4e7', backgroundcolor='#fafafa'),
             ),
+            height=550,
+            showlegend=True,
+            legend=dict(font=dict(size=10), bgcolor='rgba(250,250,250,0.9)'),
+            plot_bgcolor='#fafafa',
+            paper_bgcolor='#fafafa',
+            font=dict(family='Inter, system-ui, sans-serif', color='#52525b')
         )
     else:
             rows=2, cols=2,
             specs=[[{"type": "scatter"}, {"type": "scatter"}],
                    [{"type": "scatter", "colspan": 2}, None]],
+            subplot_titles=('by cluster', 'by position', 'sequence map'),
             row_heights=[0.6, 0.4],
             vertical_spacing=0.12
         )
             x=embedding_reduced[:, 0],
             y=embedding_reduced[:, 1],
             mode='lines',
+            line=dict(color='rgba(113,113,122,0.15)', width=1),
             hoverinfo='skip',
             showlegend=False
         ), row=1, col=1)
                 x=embedding_reduced[mask, 0],
                 y=embedding_reduced[mask, 1],
                 mode='markers',
+                marker=dict(size=7, color=grays[c],
+                           line=dict(width=0.5, color='white')),
                 text=[hover_text[i] for i in np.where(mask)[0]],
                 hovertemplate='%{text}<extra></extra>',
+                name=f'{c}',
                 legendgroup=f'c{c}'
             ), row=1, col=1)
         # Start/End markers
         fig.add_trace(go.Scatter(
             x=[embedding_reduced[0, 0]], y=[embedding_reduced[0, 1]],
+            mode='markers', marker=dict(size=12, color='#18181b', symbol='triangle-up',
+                                        line=dict(width=1, color='white')),
+            name="5'", showlegend=True
         ), row=1, col=1)
         fig.add_trace(go.Scatter(
             x=[embedding_reduced[-1, 0]], y=[embedding_reduced[-1, 1]],
+            mode='markers', marker=dict(size=12, color='#71717a', symbol='square',
+                                        line=dict(width=1, color='white')),
+            name="3'", showlegend=True
         ), row=1, col=1)
+        # Right plot: by position - grayscale gradient
         fig.add_trace(go.Scatter(
             x=embedding_reduced[:, 0],
             y=embedding_reduced[:, 1],
             mode='lines+markers',
+            line=dict(color='rgba(113,113,122,0.2)', width=1),
+            marker=dict(size=7, color=np.arange(n_windows), colorscale='Greys',
+                       showscale=True, colorbar=dict(title=dict(text='window', font=dict(size=10)),
+                                                     x=1.02, tickfont=dict(size=9))),
             text=hover_text,
             hovertemplate='%{text}<extra></extra>',
             showlegend=False
         fig.add_trace(go.Scatter(
             x=[embedding_reduced[0, 0]], y=[embedding_reduced[0, 1]],
+            mode='markers', marker=dict(size=12, color='#18181b', symbol='triangle-up',
+                                        line=dict(width=1, color='white')),
             showlegend=False
         ), row=1, col=2)
         fig.add_trace(go.Scatter(
             x=[embedding_reduced[-1, 0]], y=[embedding_reduced[-1, 1]],
+            mode='markers', marker=dict(size=12, color='#71717a', symbol='square',
+                                        line=dict(width=1, color='white')),
             showlegend=False
         ), row=1, col=2)
+        # Bottom: sequence map - grayscale blocks
         window_size = 1000
         for i, (cluster, pos) in enumerate(zip(cluster_labels, positions)):
             fig.add_trace(go.Scatter(
                 x=[pos, pos + window_size, pos + window_size, pos, pos],
                 y=[0, 0, 1, 1, 0],
                 fill='toself',
+                fillcolor=grays[cluster],
                 line=dict(width=0),
                 hoverinfo='text',
                 text=f'Position {pos}-{pos+window_size} bp<br>Cluster {cluster}',
                 showlegend=False
             ), row=2, col=1)
+        fig.update_xaxes(title_text='UMAP 1', row=1, col=1, gridcolor='#f4f4f5',
+                        tickfont=dict(size=9, color='#71717a'))
+        fig.update_yaxes(title_text='UMAP 2', row=1, col=1, gridcolor='#f4f4f5',
+                        tickfont=dict(size=9, color='#71717a'))
+        fig.update_xaxes(title_text='UMAP 1', row=1, col=2, gridcolor='#f4f4f5',
+                        tickfont=dict(size=9, color='#71717a'))
+        fig.update_yaxes(title_text='UMAP 2', row=1, col=2, gridcolor='#f4f4f5',
+                        tickfont=dict(size=9, color='#71717a'))
+        fig.update_xaxes(title_text='position (bp)', row=2, col=1, gridcolor='#f4f4f5',
+                        tickfont=dict(size=9, color='#71717a'))
         fig.update_yaxes(visible=False, row=2, col=1)
         fig.update_layout(
+            title=None,
+            height=650,
             showlegend=True,
+            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1,
+                       font=dict(size=9), bgcolor='rgba(250,250,250,0.9)'),
+            plot_bgcolor='#fafafa',
+            paper_bgcolor='#fafafa',
+            font=dict(family='Inter, system-ui, sans-serif', color='#52525b', size=11),
+            margin=dict(t=40, b=40)
         )
+        # Style subplot titles
+        for annotation in fig['layout']['annotations']:
+            annotation['font'] = dict(size=11, color='#52525b')
     return fig
     """Parse a FASTA file and return the sequence."""
     if file_path is None:
         return None
+    size = os.path.getsize(file_path)
+    if size > MAX_UPLOAD_BYTES:
+        raise gr.Error(f"Uploaded file is too large ({size:,} bytes > {MAX_UPLOAD_BYTES:,} byte limit).")
+    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
         content = f.read()
+    is_valid, cleaned, error = normalize_sequence_input(content)
+    if not is_valid:
+        raise gr.Error(error)
+    return cleaned
+def create_gff3_export(regions, sequence_length, sequence_id="input_sequence", output_dir=None):
     """Create GFF3 format annotation file for detected CRISPR regions."""
+    output_dir = output_dir or make_output_dir("crispr_export")
+    gff_path = os.path.join(output_dir, "crispr_regions.gff3")
     with open(gff_path, 'w') as f:
         # GFF3 header
         for r in regions:
             # GFF3 format: seqid source type start end score strand phase attributes
             attributes = f"ID=CRISPR_{r['region_id']};Name=CRISPR_array_{r['region_id']};score={r['mean_score']:.3f}"
+            f.write(f"{sequence_id}\tCRISPR-BERT\tCRISPR_array\t{r['start']}\t{r['end']}\t{r['mean_score']:.3f}\t.\t.\t{attributes}\n")
     return gff_path
 def create_sequence_viewer_html(sequence, positions, probabilities, threshold=0.3, chunk_size=100):
+    """Create an HTML visualization of the sequence with grayscale intensity scores."""
     seq_len = len(sequence)
+    if seq_len > MAX_SEQUENCE_VIEWER_LENGTH:
+        return (
+            '<div style="background: #fafafa; padding: 16px; border: 1px solid #e4e4e7;">'
+            f'Sequence viewer disabled for sequences longer than {MAX_SEQUENCE_VIEWER_LENGTH:,} bp '
+            f'(current sequence: {seq_len:,} bp). Use the plot and downloads for full results.'
+            '</div>'
+        )
+    per_base_scores = np.asarray(probabilities, dtype=float)
+    if len(per_base_scores) != seq_len:
+        per_base_scores = np.resize(per_base_scores, seq_len)
+    # Generate HTML - monochrome style
+    html_parts = ['<div style="font-family: \'Geist Mono\', \'SF Mono\', Consolas, monospace; font-size: 11px; line-height: 1.9; background: #fafafa; padding: 16px; border: 1px solid #e4e4e7; max-height: 400px; overflow-y: auto;">']
+    html_parts.append('<div style="margin-bottom: 12px; font-family: Inter, system-ui, sans-serif; font-size: 11px; color: #71717a;">')
+    html_parts.append('<span style="background: linear-gradient(to right, #fafafa, #18181b); padding: 3px 24px; border: 1px solid #e4e4e7; display: inline-block;">low → high</span>')
+    html_parts.append(f'<span style="margin-left: 12px;">threshold: {threshold}</span>')
     html_parts.append('</div>')
+    # Process sequence in chunks
     for chunk_start in range(0, seq_len, chunk_size):
         chunk_end = min(chunk_start + chunk_size, seq_len)
         chunk_seq = sequence[chunk_start:chunk_end]
         chunk_scores = per_base_scores[chunk_start:chunk_end]
         # Position marker
+        html_parts.append(f'<div><span style="color: #a1a1aa; width: 55px; display: inline-block; font-size: 10px;">{chunk_start+1:,}</span>')
         for i, (base, score) in enumerate(zip(chunk_seq, chunk_scores)):
+            # Grayscale intensity based on score
+            intensity = int(255 - score * 200)  # Higher score = darker
+            color = f'rgb({intensity},{intensity},{intensity})'
+            bg_intensity = int(250 - score * 40)
+            bg_color = f'rgb({bg_intensity},{bg_intensity},{bg_intensity})'
+            font_weight = '600' if score >= threshold else '400'
+            safe_base = html.escape(base)
+            html_parts.append(f'<span style="color: {color}; background-color: {bg_color}; font-weight: {font_weight};" title="pos {chunk_start + i + 1}: {score:.3f}">{safe_base}</span>')
         html_parts.append('</div>')
 # Build interface
 with gr.Blocks(title="CRISPR Array Detection") as demo:
     gr.Markdown("""
+# crispr-detect
+BERT-based CRISPR array detection. 24-layer transformer (430M params) trained on metagenomic sequences.
+Sliding window analysis with per-position probability scores. Export to GFF3/CSV.
     """)
     with gr.Tab("Prediction"):
         with gr.Row():
             with gr.Column(scale=1):
                 seq_input = gr.Textbox(
+                    label="sequence",
                     placeholder="Paste DNA sequence (FASTA format accepted)...",
                     lines=6,
                     value=FLANKED_CRISPR_EXAMPLE,
+                    info="min 1000 bp"
                 )
                 file_upload = gr.File(
+                    label="upload fasta",
                     file_types=[".fasta", ".fa", ".fna", ".txt"],
                     type="filepath"
                 )
                 with gr.Row():
                     stride_input = gr.Slider(
                         minimum=50, maximum=500, value=100, step=50,
+                        label="stride",
+                        info="lower = higher resolution"
                     )
                     threshold_input = gr.Slider(
                         minimum=0.1, maximum=0.9, value=0.3, step=0.05,
+                        label="threshold",
+                        info="lower = sensitive, higher = specific"
                     )
                 with gr.Row():
+                    predict_btn = gr.Button("run", variant="primary", size="lg")
+                gr.Markdown("*examples:*")
                 with gr.Row():
+                    gr.Button("flanked", size="sm").click(
                         lambda: FLANKED_CRISPR_EXAMPLE, outputs=seq_input
                     )
+                    gr.Button("e.coli", size="sm").click(
                         lambda: ECOLI_CRISPR_EXAMPLE, outputs=seq_input
                     )
                 with gr.Row():
+                    gr.Button("crispr", size="sm").click(
                         lambda: CRISPR_EXAMPLE, outputs=seq_input
                     )
+                    gr.Button("control", size="sm").click(
                         lambda: NON_CRISPR_EXAMPLE, outputs=seq_input
                     )
                 result_summary = gr.Markdown()
+                with gr.Accordion("export", open=False, visible=False) as download_accordion:
                     with gr.Row():
+                        pred_download_png = gr.File(label="png", interactive=False)
+                        pred_download_pdf = gr.File(label="pdf", interactive=False)
                     with gr.Row():
+                        pred_download_csv = gr.File(label="csv", interactive=False)
+                        pred_download_gff = gr.File(label="gff3", interactive=False)
                     with gr.Row():
+                        pred_download_summary = gr.File(label="summary", interactive=False)
             with gr.Column(scale=2):
+                plot_output = gr.Plot(label="prediction")
+                with gr.Accordion("sequence", open=False, visible=False) as seq_viewer_accordion:
+                    gr.Markdown("*grayscale intensity = score. hover for values.*")
+                    seq_viewer_html = gr.HTML(label="sequence")
                 regions_output = gr.JSON(label="Detected Regions", visible=False)
         # Handle file upload - load content into textbox
     with gr.Tab("Embeddings"):
         gr.Markdown("""
+### embeddings
+768-dim hidden states from transformer layer 21. UMAP projection + agglomerative clustering.
+Repeats cluster together, spacers form distinct groups.
 """)
         with gr.Row():
             with gr.Column(scale=1):
                 embed_seq = gr.Textbox(
+                    label="sequence",
                     placeholder="Paste DNA sequence...",
                     lines=6,
                     value=EMBEDDING_CRISPR_EXAMPLE,
+                    info="min ~2000 bp for clustering"
                 )
                 embed_mode = gr.Radio(
                     choices=["state-dynamics", "mean", "max", "trajectory"],
                     value="state-dynamics",
+                    label="mode",
+                    info=""
                 )
                 use_3d = gr.Checkbox(
+                    label="3D",
                     value=False,
+                    info="",
                     visible=True
                 )
                 with gr.Row():
+                    embed_btn = gr.Button("extract", variant="primary")
                 with gr.Row():
+                    gr.Button("crispr 3kb", size="sm").click(
                         lambda: EMBEDDING_CRISPR_EXAMPLE, outputs=embed_seq
                     )
+                    gr.Button("control 3kb", size="sm").click(
                         lambda: EMBEDDING_RANDOM_EXAMPLE, outputs=embed_seq
                     )
+                gr.Markdown("*example: 600bp upstream | 25 repeats + 24 spacers | 600bp downstream*")
                 embed_summary = gr.Markdown()
+                with gr.Accordion("export", open=False, visible=False) as embed_download_accordion:
                     with gr.Row():
+                        download_png = gr.File(label="png", interactive=False)
+                        download_pdf = gr.File(label="pdf", interactive=False)
             with gr.Column(scale=2):
+                embed_plot = gr.Plot(label="embedding")
         # Show/hide 3D checkbox based on mode
         embed_mode.change(
     with gr.Tab("API"):
         gr.Markdown("""
+### api
 ```python
 from gradio_client import Client
 client = Client("genomenet/crispr-array-detection")
+# predict
 result = client.predict(
+    sequence="ATGC...",
+    stride=100,
+    threshold=0.3,
     api_name="/predict"
 )
+# embeddings
 result = client.predict(
     sequence="ATGC...",
+    mode="state-dynamics",
     use_3d=False,
     api_name="/get_embedding"
 )
 ```
+**output formats**: CSV (scores), GFF3 (annotations), PNG/PDF (figures)
+**local**:
 ```bash
 git clone https://huggingface.co/spaces/genomenet/crispr-array-detection
+pip install -r requirements.txt && python app.py
 ```
         """)
     with gr.Tab("About"):
         gr.Markdown("""
+### about
+| | |
+|---|---|
+| architecture | BERT, 24 layers, 768 hidden, 12 heads, 430M params |
+| training | metagenomic contigs, microbial genomes, CRISPRCasdb |
+| window | 1000 bp |
+| embedding | layer 21 (768-dim) |
+**parameters**
+| param | default | range |
+|-------|---------|-------|
+| stride | 100 bp | 50-500 |
+| threshold | 0.3 | 0.1-0.9 |
+**citation**
+Mu, Z. (2024). Deep Learning-Based CRISPR Array Detection. Master's Thesis, HZI.
+**acknowledgements**
+DFG SPP 2141 (MC 172) / BMBF de.NBI GenomeNet / HZI BIFO
         """)
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        theme=gr.themes.Base(
+            primary_hue=gr.themes.colors.zinc,
+            secondary_hue=gr.themes.colors.zinc,
+            neutral_hue=gr.themes.colors.zinc,
+            font=gr.themes.GoogleFont("Inter"),
+            font_mono=gr.themes.GoogleFont("Geist Mono"),
+        ),
         css=CUSTOM_CSS
     )

inference/inference.py CHANGED Viewed

@@ -74,6 +74,9 @@ def predict_batch(
     Returns:
         Predictions of shape (N, window_size) with probabilities
     """
     expected_dtype = model.inputs[0].dtype
     windows = cast_for_model(windows, expected_dtype)
@@ -121,6 +124,9 @@ def aggregate_predictions(
     Returns:
         Per-position probability array of shape (seq_length,)
     """
     scores = np.zeros(seq_length, dtype=np.float32)
     counts = np.zeros(seq_length, dtype=np.int32)
@@ -162,8 +168,10 @@ def predict_sequence(
     Returns:
         PredictionResult with per-position probabilities
     """
-    if model is None:
-        model = get_model()
     # Tokenize sequence
     tokens = encode_sequence(sequence)
@@ -172,6 +180,9 @@ def predict_sequence(
     # Create sliding windows
     windows, starts = create_windows(tokens, window_size=WINDOW_SIZE, stride=stride)
     logger.info(f"Processing sequence: {seq_length} bp, {len(windows)} windows (stride={stride})")
     # Run batched prediction
@@ -211,6 +222,9 @@ def embed_batch(
     Returns:
         Embeddings of shape (N, window_size, embed_dim) or (N, embed_dim)
     """
     expected_dtype = model.inputs[0].dtype
     windows = cast_for_model(windows, expected_dtype)
@@ -249,8 +263,10 @@ def embed_sequence(
     Returns:
         EmbeddingResult (for mean/cls/max) or TrajectoryResult (for trajectory)
     """
-    if model is None:
-        model = get_embedding_model()
     # Tokenize sequence
     tokens = encode_sequence(sequence)
@@ -259,6 +275,9 @@ def embed_sequence(
     # Create windows
     windows, starts = create_windows(tokens, window_size=WINDOW_SIZE, stride=stride)
     logger.info(f"Extracting embeddings: {seq_length} bp, {len(windows)} windows")
     # Get embeddings (shape: N, window_size, embed_dim)
@@ -306,7 +325,8 @@ def detect_crispr_regions(
     min_length: int = 160,
     merge_gap: int = 80,
     stride: int = 100,
-    model: Optional[tf.keras.Model] = None
 ) -> list[dict]:
     """
     Detect CRISPR array regions in a sequence.
@@ -322,8 +342,16 @@ def detect_crispr_regions(
     Returns:
         List of detected regions with coordinates and scores
     """
-    # Get per-position predictions
-    result = predict_sequence(sequence, stride=stride, model=model)
     scores = np.array(result.probabilities)
     # Threshold to binary mask

     Returns:
         Predictions of shape (N, window_size) with probabilities
     """
+    if batch_size <= 0:
+        raise ValueError("batch_size must be a positive integer")
     expected_dtype = model.inputs[0].dtype
     windows = cast_for_model(windows, expected_dtype)
     Returns:
         Per-position probability array of shape (seq_length,)
     """
+    if aggregation not in {"mean", "max"}:
+        raise ValueError("aggregation must be 'mean' or 'max'")
     scores = np.zeros(seq_length, dtype=np.float32)
     counts = np.zeros(seq_length, dtype=np.int32)
     Returns:
         PredictionResult with per-position probabilities
     """
+    if aggregation not in {"mean", "max"}:
+        raise ValueError("aggregation must be 'mean' or 'max'")
+    if batch_size <= 0:
+        raise ValueError("batch_size must be a positive integer")
     # Tokenize sequence
     tokens = encode_sequence(sequence)
     # Create sliding windows
     windows, starts = create_windows(tokens, window_size=WINDOW_SIZE, stride=stride)
+    if model is None:
+        model = get_model()
     logger.info(f"Processing sequence: {seq_length} bp, {len(windows)} windows (stride={stride})")
     # Run batched prediction
     Returns:
         Embeddings of shape (N, window_size, embed_dim) or (N, embed_dim)
     """
+    if batch_size <= 0:
+        raise ValueError("batch_size must be a positive integer")
     expected_dtype = model.inputs[0].dtype
     windows = cast_for_model(windows, expected_dtype)
     Returns:
         EmbeddingResult (for mean/cls/max) or TrajectoryResult (for trajectory)
     """
+    if mode not in {"mean", "cls", "max", "trajectory"}:
+        raise ValueError("mode must be one of: mean, cls, max, trajectory")
+    if batch_size <= 0:
+        raise ValueError("batch_size must be a positive integer")
     # Tokenize sequence
     tokens = encode_sequence(sequence)
     # Create windows
     windows, starts = create_windows(tokens, window_size=WINDOW_SIZE, stride=stride)
+    if model is None:
+        model = get_embedding_model()
     logger.info(f"Extracting embeddings: {seq_length} bp, {len(windows)} windows")
     # Get embeddings (shape: N, window_size, embed_dim)
     min_length: int = 160,
     merge_gap: int = 80,
     stride: int = 100,
+    model: Optional[tf.keras.Model] = None,
+    prediction_result: Optional[PredictionResult] = None
 ) -> list[dict]:
     """
     Detect CRISPR array regions in a sequence.
     Returns:
         List of detected regions with coordinates and scores
     """
+    if not 0.0 <= threshold <= 1.0:
+        raise ValueError("threshold must be between 0 and 1")
+    if min_length < 1:
+        raise ValueError("min_length must be at least 1")
+    if merge_gap < 0:
+        raise ValueError("merge_gap must be non-negative")
+    # Get per-position predictions, or reuse a caller-provided result to avoid
+    # running the model twice in UI flows that need both scores and regions.
+    result = prediction_result or predict_sequence(sequence, stride=stride, model=model)
     scores = np.array(result.probabilities)
     # Threshold to binary mask

inference/tokenizer.py CHANGED Viewed

@@ -11,7 +11,6 @@ Token mapping:
 """
 import numpy as np
-from typing import Union
 VOCAB_SIZE = 6
 WINDOW_SIZE = 1000
@@ -30,6 +29,22 @@ _LUT[ord("g")] = 3
 _LUT[ord("t")] = 4
 def encode_sequence(sequence: str) -> np.ndarray:
     """
     Convert DNA sequence string to integer token array.
@@ -43,7 +58,10 @@ def encode_sequence(sequence: str) -> np.ndarray:
     # Convert to uppercase for consistency
     seq_upper = sequence.upper()
     # Convert to bytes and apply lookup
-    seq_bytes = np.frombuffer(seq_upper.encode("ascii"), dtype=np.uint8)
     return _LUT[seq_bytes]
@@ -69,7 +87,8 @@ def validate_sequence(sequence: str) -> tuple[bool, str]:
     invalid_chars = seq_chars - valid_chars
     if invalid_chars:
-        return False, f"Invalid characters in sequence: {invalid_chars}"
     return True, ""
@@ -84,8 +103,13 @@ def strip_fasta_header(text: str) -> str:
     Returns:
         Sequence string with headers removed
     """
-    lines = text.strip().split("\n")
-    sequence_lines = [line.strip() for line in lines if not line.startswith(">")]
     return "".join(sequence_lines)
@@ -105,6 +129,8 @@ def create_windows(
     Returns:
         Tuple of (windows array, start positions array)
     """
     seq_len = len(tokens)
     if seq_len < window_size:

 """
 import numpy as np
 VOCAB_SIZE = 6
 WINDOW_SIZE = 1000
 _LUT[ord("t")] = 4
+def _coerce_positive_int(name: str, value) -> int:
+    """Accept int-like values from UI/API inputs; reject bools and anything that is not a positive whole number."""
+    if isinstance(value, bool):
+        raise ValueError(f"{name} must be a positive integer")
+    if isinstance(value, (int, np.integer)):
+        parsed = int(value)
+    elif isinstance(value, float) and value.is_integer():
+        parsed = int(value)
+    else:
+        raise ValueError(f"{name} must be a positive integer")
+    if parsed <= 0:
+        raise ValueError(f"{name} must be a positive integer")
+    return parsed
 def encode_sequence(sequence: str) -> np.ndarray:
     """
     Convert DNA sequence string to integer token array.
     # Convert to uppercase for consistency
     seq_upper = sequence.upper()
     # Convert to bytes and apply lookup
+    try:
+        seq_bytes = np.frombuffer(seq_upper.encode("ascii"), dtype=np.uint8)
+    except UnicodeEncodeError as exc:
+        raise ValueError("Sequence contains non-ASCII characters") from exc
     return _LUT[seq_bytes]
     invalid_chars = seq_chars - valid_chars
     if invalid_chars:
+        invalid = ", ".join(repr(c) for c in sorted(invalid_chars))
+        return False, f"Invalid characters in sequence: {invalid}"
     return True, ""
     Returns:
         Sequence string with headers removed
     """
+    lines = text.strip().splitlines()
+    sequence_lines = []
+    for line in lines:
+        line = line.strip()
+        if not line or line.startswith(">"):
+            continue
+        sequence_lines.append(line)
     return "".join(sequence_lines)
     Returns:
         Tuple of (windows array, start positions array)
     """
+    window_size = _coerce_positive_int("window_size", window_size)
+    stride = _coerce_positive_int("stride", stride)
     seq_len = len(tokens)
     if seq_len < window_size: