Spaces:
Sleeping
Add MLM surprise tab: per-base -log p(true) along the sequence
Uses the model's MLM head (final Dense-6 over the nucleotide vocabulary) that
the model was actually pretrained with. For each sliding window we mask ~15%
of positions, run one forward pass, softmax the logits, and read off
-log(p_true) at the masked positions. Aggregation:
- per-window mean surprise -> line plot with ln(6) uniform baseline
- per-base scatter at masked positions -> finer-grained view of local spikes
Low values = model confidently reconstructs the base from context (conserved
or training-typical motifs). High values near ln(6) = model is near-uniform
(unusual relative to training distribution).
One forward pass per window at the default stride, so runtime is the same
as the embedding extraction path.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
|
@@ -149,6 +149,98 @@ def embed_sequence(sequence, mode="mean", stride=100, layer=21):
|
|
| 149 |
window_emb = np.mean(embeddings, axis=1)
|
| 150 |
return np.mean(window_emb, axis=0), window_emb, positions
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
def create_embedding_heatmap(embedding, title="Embedding"):
|
| 153 |
"""Create a heatmap of a single embedding vector."""
|
| 154 |
embedding = np.array(embedding)
|
|
@@ -381,6 +473,43 @@ def process(sequence: str, mode: str, stride: int, layer: int):
|
|
| 381 |
|
| 382 |
return summary, path, heatmap_fig, trajectory_fig, familiarity_fig, dims_fig
|
| 383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
# Build interface
|
| 385 |
with gr.Blocks(
|
| 386 |
title="BERT Metagenome Embeddings",
|
|
@@ -422,6 +551,39 @@ with gr.Blocks(
|
|
| 422 |
api_name="embed"
|
| 423 |
)
|
| 424 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
with gr.Tab("API"):
|
| 426 |
gr.Markdown("""
|
| 427 |
### API
|
|
@@ -448,6 +610,20 @@ embedding = np.load(emb_path)
|
|
| 448 |
from the rest of the sequence. Spikes = unusual regions relative to context.
|
| 449 |
|
| 450 |
Numeric stats (L2, entropy, sparsity, kurtosis) are in the summary text.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
""")
|
| 452 |
|
| 453 |
with gr.Tab("About"):
|
|
|
|
| 149 |
window_emb = np.mean(embeddings, axis=1)
|
| 150 |
return np.mean(window_emb, axis=0), window_emb, positions
|
| 151 |
|
# ln(vocab_size=6): the surprise a uniform-random predictor would score.
UNIFORM_SURPRISE = float(np.log(6))
MASK_TOKEN = 0  # PAD/OOV; used as the MLM mask slot


def compute_mlm_surprise(sequence, stride=100, mask_fraction=0.15, seed=42):
    """Per-window and per-base MLM surprise.

    For each sliding window, randomly mask ~mask_fraction of positions, run one
    forward pass through the full model (which ends in a Dense(vocab_size=6)),
    softmax the per-position logits, and take -log(p_true) at the masked
    positions. Returns:

    - per_window: list of (position, mean_surprise)
    - per_base_pos, per_base_vals: flat arrays of (position, surprise) samples,
      one entry per (window x masked_position). Overlapping windows give
      multiple observations per base.
    """
    model = get_base_model()
    tokens = tokenize(sequence)
    rng = np.random.default_rng(seed)
    # Number of positions hidden in every window (always at least one).
    num_masked = max(1, int(WINDOW_SIZE * mask_fraction))

    per_window = []
    per_base_pos = []
    per_base_vals = []

    final_offset = len(tokens) - WINDOW_SIZE
    for offset in range(0, final_offset + 1, stride):
        originals = tokens[offset:offset + WINDOW_SIZE].copy()
        corrupted = originals.copy()
        hidden = rng.choice(WINDOW_SIZE, size=num_masked, replace=False)
        corrupted[hidden] = MASK_TOKEN

        # One forward pass; per-position logits over the 6-token vocabulary.
        logits = model.predict(corrupted[np.newaxis, :], verbose=0)[0]
        # Numerically-stable softmax along the vocabulary axis.
        shifted = logits - logits.max(axis=-1, keepdims=True)
        weights = np.exp(shifted)
        probs = weights / weights.sum(axis=-1, keepdims=True)

        # -log p(true base) at each masked slot; clip guards against log(0).
        p_true = np.clip(probs[hidden, originals[hidden]], 1e-10, None)
        surprises = -np.log(p_true)

        per_window.append((offset + WINDOW_SIZE // 2, float(surprises.mean())))
        per_base_pos.extend((offset + hidden).tolist())
        per_base_vals.extend(surprises.tolist())

    return per_window, np.array(per_base_pos), np.array(per_base_vals)
def create_surprise_plot(per_window, per_base_pos, per_base_vals, seq_len):
    """Two-panel Plotly figure: per-window surprise line + per-base scatter.

    Top panel draws the per-window mean with the ln(6) uniform-prediction
    baseline as a dashed reference; bottom panel shows one dot per
    (window, masked base) observation, colour-graded by surprise.
    """
    from plotly.subplots import make_subplots

    fig = make_subplots(
        rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.08,
        row_heights=[0.6, 0.4],
        subplot_titles=('per-window mean surprise (lower = model finds region predictable)',
                        'per-base surprise at masked positions (dots; darker = more surprising)')
    )

    # Top panel: one (center, mean surprise) point per window, joined as a line.
    centers = [pos for pos, _ in per_window]
    means = [val for _, val in per_window]
    fig.add_trace(
        go.Scatter(
            x=centers,
            y=means,
            mode='lines+markers',
            line=dict(color='#18181b', width=2),
            marker=dict(size=6),
            hovertemplate='center %{x} bp<br>surprise %{y:.3f} nats<extra></extra>',
            showlegend=False,
        ),
        row=1, col=1,
    )

    # Dashed reference: what a uniform-random predictor would score.
    fig.add_hline(
        y=UNIFORM_SURPRISE,
        line_dash='dash',
        line_color='#a1a1aa',
        annotation_text=f'uniform baseline (ln 6 = {UNIFORM_SURPRISE:.2f})',
        annotation_position='top right',
        annotation_font=dict(size=10, color='#71717a'),
        row=1, col=1,
    )

    # Bottom panel: raw per-base observations shaded on a Reds scale.
    colorbar_style = dict(title=dict(text='nats', font=dict(size=10)),
                          thickness=10, len=0.35, y=0.18, tickfont=dict(size=9))
    fig.add_trace(
        go.Scatter(
            x=per_base_pos,
            y=per_base_vals,
            mode='markers',
            marker=dict(size=4, color=per_base_vals, colorscale='Reds',
                        cmin=0, cmax=UNIFORM_SURPRISE, colorbar=colorbar_style),
            hovertemplate='pos %{x} bp<br>surprise %{y:.3f}<extra></extra>',
            showlegend=False,
        ),
        row=2, col=1,
    )

    # Shared x-range so the two panels line up base-for-base.
    for panel in (1, 2):
        fig.update_xaxes(range=[0, seq_len], row=panel, col=1)
        fig.update_yaxes(title_text='nats', row=panel, col=1, rangemode='tozero')
    fig.update_xaxes(title_text='position (bp)', row=2, col=1)
    fig.update_layout(height=520, margin=dict(l=50, r=20, t=50, b=50))

    # Subplot titles arrive without an explicit font; give them one size.
    for ann in fig['layout']['annotations']:
        if 'font' not in ann:
            ann['font'] = dict(size=11)
    return fig
+
|
| 243 |
+
|
| 244 |
def create_embedding_heatmap(embedding, title="Embedding"):
|
| 245 |
"""Create a heatmap of a single embedding vector."""
|
| 246 |
embedding = np.array(embedding)
|
|
|
|
| 473 |
|
| 474 |
return summary, path, heatmap_fig, trajectory_fig, familiarity_fig, dims_fig
|
| 475 |
|
| 476 |
def process_surprise(sequence: str, stride: int, mask_fraction: float):
    """Score MLM surprise for one sequence; return (markdown summary, figure).

    On invalid or too-short input, returns an error markdown string and None
    instead of raising, so the Gradio UI shows the message inline.
    """
    sequence = strip_fasta_header(sequence.strip())
    is_valid, error = validate_sequence(sequence)
    if not is_valid:
        return f"**Error**: {error}", None

    per_window, per_base_pos, per_base_vals = compute_mlm_surprise(
        sequence, stride=stride, mask_fraction=mask_fraction
    )
    if not per_window:
        return "**Error**: sequence too short for one window", None

    fig = create_surprise_plot(per_window, per_base_pos, per_base_vals, len(sequence))

    # Headline numbers: overall mean plus the extreme windows (first-occurrence
    # ties, matching argmin/argmax semantics).
    scores = np.array([surprise for _, surprise in per_window])
    best_pos, best_val = min(per_window, key=lambda pw: pw[1])
    worst_pos, worst_val = max(per_window, key=lambda pw: pw[1])

    summary = f"""### MLM surprise

| | |
|---|---|
| sequence | {len(sequence):,} bp |
| windows | {len(per_window)} |
| mask fraction | {mask_fraction:.0%} |
| mean surprise | {scores.mean():.3f} nats |
| uniform baseline | {UNIFORM_SURPRISE:.3f} nats (ln 6) |
| most predictable window | {best_val:.3f} nats @ ~{best_pos:,} bp |
| most surprising window | {worst_val:.3f} nats @ ~{worst_pos:,} bp |

Lower = model confidently predicts the true base → conserved/typical pattern.
Higher = model is unsure → unusual region relative to training distribution.
"""
    return summary, fig
|
| 512 |
+
|
| 513 |
# Build interface
|
| 514 |
with gr.Blocks(
|
| 515 |
title="BERT Metagenome Embeddings",
|
|
|
|
| 551 |
api_name="embed"
|
| 552 |
)
|
| 553 |
|
| 554 |
    # --- MLM surprise tab: mask-and-predict scoring along the sequence ---
    with gr.Tab("MLM surprise"):
        gr.Markdown("""
        Per-base "surprise" from the model's masked-language-modeling head.
        Each window randomly masks ~15% of positions, one forward pass predicts them,
        and we measure how hard the model finds each true base to reconstruct.
        **Lower** = conserved/predictable pattern. **Higher** = unusual region.
        """)
        with gr.Row():
            # Left column: sequence input plus the two scoring knobs.
            with gr.Column(scale=1, min_width=260):
                surp_seq = gr.Textbox(
                    label="sequence",
                    placeholder="Paste DNA (FASTA or raw)...",
                    lines=8,
                    value=EXAMPLE_SEQUENCE,
                )
                # Window step in bp; smaller stride = more windows/forward passes.
                surp_stride = gr.Slider(50, 500, value=100, step=50, label="stride",
                                        info="lower = finer resolution, more compute")
                # Fraction of each window hidden before the forward pass.
                surp_mask = gr.Slider(0.05, 0.5, value=0.15, step=0.05,
                                      label="mask fraction",
                                      info="fraction of positions masked per window")
                surp_btn = gr.Button("score", variant="primary")

            # Right column: markdown stats table + the two-panel plot.
            with gr.Column(scale=3, min_width=500):
                surp_summary = gr.Markdown()
                surp_plot = gr.Plot(label="surprise along sequence")

        # Wire the button to process_surprise; also exposed as /surprise API.
        surp_btn.click(
            process_surprise,
            inputs=[surp_seq, surp_stride, surp_mask],
            outputs=[surp_summary, surp_plot],
            api_name="surprise",
        )
|
| 587 |
with gr.Tab("API"):
|
| 588 |
gr.Markdown("""
|
| 589 |
### API
|
|
|
|
| 610 |
from the rest of the sequence. Spikes = unusual regions relative to context.
|
| 611 |
|
| 612 |
Numeric stats (L2, entropy, sparsity, kurtosis) are in the summary text.
|
| 613 |
+
|
| 614 |
+
### MLM surprise endpoint
|
| 615 |
+
|
| 616 |
+
```python
|
| 617 |
+
summary, plot = client.predict(
|
| 618 |
+
sequence="ATGC...",
|
| 619 |
+
stride=100,
|
| 620 |
+
mask_fraction=0.15,
|
| 621 |
+
api_name="/surprise",
|
| 622 |
+
)
|
| 623 |
+
```
|
| 624 |
+
|
| 625 |
+
Returns per-window mean `-log(p_true)` at masked positions (in nats).
|
| 626 |
+
Uniform-random baseline is `ln(6) ≈ 1.79 nats`.
|
| 627 |
""")
|
| 628 |
|
| 629 |
with gr.Tab("About"):
|