Spaces:

genomenet
/

bert-embedding

Sleeping

App Files Files Community

genomenet committed 23 days ago

Commit

038ad80

1 Parent(s): 0c6b9b9

Add embedding stats, full-width layout, 3-column design

Browse files

Files changed (1) hide show

app.py +166 -172

app.py CHANGED Viewed

@@ -13,9 +13,7 @@ from huggingface_hub import hf_hub_download
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
-from matplotlib.colors import TwoSlopeNorm
 import plotly.graph_objects as go
-from plotly.subplots import make_subplots
 from custom_layers import get_custom_objects
@@ -23,12 +21,12 @@ from custom_layers import get_custom_objects
 MODEL_REPO = "genomenet/bert-metagenome"
 MODEL_FILE = "bert_1k_3.h5"
 WINDOW_SIZE = 1000
-NUM_LAYERS = 24  # Transformer blocks 0-23
 EMBEDDING_DIM = 768
 # Singleton model cache
 _model = None
-_embedding_models = {}  # layer_idx -> embedding_model
 def get_base_model():
     """Load and cache the base model."""
@@ -39,6 +37,8 @@ def get_base_model():
         print(f"Loading model from {model_path}...")
         _model = tf.keras.models.load_model(model_path, custom_objects=get_custom_objects(), compile=False)
         print("Model loaded.")
     return _model
 def get_embedding_model(layer_idx=21):
@@ -53,7 +53,6 @@ def get_embedding_model(layer_idx=21):
                 outputs=model.get_layer(layer_name).output
             )
         except ValueError:
-            # Fallback to layer 21 if requested layer not found
             _embedding_models[layer_idx] = tf.keras.Model(
                 inputs=model.input,
                 outputs=model.get_layer("layer_transformer_block_21").output
@@ -62,25 +61,14 @@ def get_embedding_model(layer_idx=21):
 def get_gpu_status():
     gpus = tf.config.list_physical_devices('GPU')
-    if gpus:
-        return f"GPU: {gpus[0].name}"
-    return "CPU only"
-# Tokenization - Integer token IDs
 TOKEN_MAP = {'A': 1, 'C': 2, 'G': 3, 'T': 4, 'N': 5}
 def tokenize(sequence):
-    """Convert DNA sequence to integer token IDs."""
     sequence = sequence.upper().replace('U', 'T')
-    tokens = []
-    for char in sequence:
-        if char in TOKEN_MAP:
-            tokens.append(TOKEN_MAP[char])
-        elif char in 'RYSWKMBDHV':
-            tokens.append(5)
-        else:
-            tokens.append(5)
-    return np.array(tokens, dtype=np.int32)
 def validate_sequence(sequence):
     if not sequence or len(sequence.strip()) == 0:
@@ -97,29 +85,57 @@ def validate_sequence(sequence):
 def strip_fasta_header(text):
     lines = text.strip().split('\n')
-    seq_lines = [l for l in lines if not l.startswith('>')]
-    return ''.join(seq_lines).replace(' ', '').replace('\t', '')
 def embed_sequence(sequence, mode="mean", stride=100, layer=21):
     """Extract embeddings from sequence."""
     model = get_embedding_model(layer)
     seq_len = len(sequence)
     embeddings = []
     positions = []
     for start in range(0, seq_len - WINDOW_SIZE + 1, stride):
         window = sequence[start:start + WINDOW_SIZE]
-        tokens = tokenize(window)
-        tokens = np.expand_dims(tokens, axis=0)
         emb = model.predict(tokens, verbose=0)
         embeddings.append(emb[0])
         positions.append(start)
     embeddings = np.array(embeddings)  # (n_windows, 1000, 768)
-    # Pool across sequence positions within each window
     if mode == "mean":
         window_emb = np.mean(embeddings, axis=1)
         return np.mean(window_emb, axis=0), window_emb, positions
@@ -140,120 +156,116 @@ def create_embedding_heatmap(embedding, title="Embedding"):
     cols = 32
     rows = int(np.ceil(n_dims / cols))
-    # Pad to fill grid
     padded = np.full(rows * cols, np.nan)
     padded[:n_dims] = embedding
     grid = padded.reshape(rows, cols)
-    # Symmetric normalization
     finite = embedding[np.isfinite(embedding)]
-    if finite.size > 0:
-        vmax = max(abs(np.nanmin(finite)), abs(np.nanmax(finite)), 0.01)
-    else:
-        vmax = 1.0
-    fig, ax = plt.subplots(figsize=(12, max(3, rows * 0.3)))
     im = ax.imshow(grid, cmap='RdBu_r', vmin=-vmax, vmax=vmax, aspect='auto')
-    cbar = plt.colorbar(im, ax=ax, shrink=0.8)
-    cbar.set_label('Activation', fontsize=9)
-    ax.set_xlabel('Dimension', fontsize=9)
-    ax.set_ylabel('Row', fontsize=9)
-    ax.set_title(f'{title} ({n_dims} dims)', fontsize=10)
     ax.set_xticks(np.arange(0, cols, 8))
     plt.tight_layout()
     return fig
-def create_trajectory_plot(window_embeddings, positions, stride):
-    """Create interactive trajectory plot showing embedding evolution."""
-    n_windows = len(window_embeddings)
-    # Subsample dimensions for visualization
     emb = np.array(window_embeddings)
-    n_dims = emb.shape[1]
-    if n_dims > 100:
-        step = n_dims // 100
-        emb_sub = emb[:, ::step]
-    else:
-        emb_sub = emb
-    # Create heatmap
-    fig = go.Figure()
-    # Symmetric color scale
     vmax = max(abs(np.nanmin(emb_sub)), abs(np.nanmax(emb_sub)), 0.01)
-    fig.add_trace(go.Heatmap(
         z=emb_sub,
         x=list(range(emb_sub.shape[1])),
-        y=[f"{p}-{p+WINDOW_SIZE}" for p in positions],
         colorscale='RdBu_r',
         zmin=-vmax, zmax=vmax,
-        colorbar=dict(title='Activation'),
-        hovertemplate='Window: %{y}<br>Dim: %{x}<br>Value: %{z:.3f}<extra></extra>'
     ))
     fig.update_layout(
-        title=None,
-        xaxis=dict(title='Dimension (subsampled)' if n_dims > 100 else 'Dimension',
-                   tickfont=dict(size=9)),
-        yaxis=dict(title='Window position (bp)', tickfont=dict(size=9)),
-        height=max(300, n_windows * 20 + 100),
-        plot_bgcolor='#fafafa',
-        paper_bgcolor='#fafafa',
-        font=dict(family='Inter, system-ui, sans-serif', size=10)
     )
     return fig
-def create_dimension_plot(window_embeddings, positions, top_k=10):
-    """Show top varying dimensions across windows."""
     emb = np.array(window_embeddings)
-    # Find dimensions with highest variance
     variances = np.var(emb, axis=0)
     top_dims = np.argsort(variances)[-top_k:][::-1]
-    fig = go.Figure()
-    colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00',
-              '#a65628', '#f781bf', '#999999', '#66c2a5', '#fc8d62']
     for i, dim in enumerate(top_dims):
         fig.add_trace(go.Scatter(
-            x=positions,
-            y=emb[:, dim],
-            mode='lines',
-            name=f'dim {dim}',
-            line=dict(color=colors[i % len(colors)], width=1.5),
-            hovertemplate=f'Dim {dim}<br>Pos: %{{x}}<br>Value: %{{y:.3f}}<extra></extra>'
         ))
     fig.update_layout(
-        title=None,
-        xaxis=dict(title='Position (bp)', tickfont=dict(size=9)),
-        yaxis=dict(title='Activation', tickfont=dict(size=9)),
-        height=350,
-        legend=dict(orientation='h', yanchor='bottom', y=1.02, font=dict(size=9)),
-        plot_bgcolor='#fafafa',
-        paper_bgcolor='#fafafa',
-        font=dict(family='Inter, system-ui, sans-serif', size=10)
     )
     return fig
-# Example sequence (1100 bp)
 EXAMPLE_SEQUENCE = """ATGCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTACGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCG"""
-def process(sequence: str, mode: str, stride: int, layer: int, show_heatmap: bool, show_trajectory: bool):
     """Main processing function."""
     sequence = strip_fasta_header(sequence.strip())
     is_valid, error = validate_sequence(sequence)
     if not is_valid:
-        return f"**Error**: {error}", None, None, None, None
     embedding, window_embeddings, positions = embed_sequence(
         sequence, mode=mode, stride=stride, layer=layer
@@ -263,22 +275,29 @@ def process(sequence: str, mode: str, stride: int, layer: int, show_heatmap: boo
     path = os.path.join(tempfile.gettempdir(), "embedding.npy")
     np.save(path, embedding)
     # Create summary
     if mode == "per-window":
-        emb_shape = f"({embedding.shape[0]}, {embedding.shape[1]})"
-        summary = f"""## Embeddings extracted
 | | |
 |---|---|
 | sequence | {len(sequence):,} bp |
 | layer | {layer} |
 | windows | {embedding.shape[0]} |
-| dim | {embedding.shape[1]} |
-| stride | {stride} bp |
 """
     else:
-        emb_str = ", ".join([f"{x:.3f}" for x in embedding[:8]])
-        summary = f"""## Embedding extracted
 | | |
 |---|---|
@@ -287,81 +306,60 @@ def process(sequence: str, mode: str, stride: int, layer: int, show_heatmap: boo
 | mode | {mode} |
 | dim | {len(embedding)} |
-**First 8 dims**: [{emb_str}, ...]
 """
     # Create visualizations
     heatmap_fig = None
-    trajectory_fig = None
-    dims_fig = None
-    if show_heatmap and mode != "per-window":
-        heatmap_fig = create_embedding_heatmap(embedding, f"Layer {layer} Embedding")
-    if show_trajectory and len(window_embeddings) > 1:
-        trajectory_fig = create_trajectory_plot(window_embeddings, positions, stride)
-        dims_fig = create_dimension_plot(window_embeddings, positions)
-    return summary, path, heatmap_fig, trajectory_fig, dims_fig
-# CSS
-CUSTOM_CSS = """
-@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500&display=swap');
-* { font-family: 'Inter', system-ui, sans-serif !important; }
-code, pre, textarea { font-family: 'SF Mono', Consolas, monospace !important; }
-.gradio-container { max-width: 1100px !important; background: #fafafa !important; }
-"""
 # Build interface
-with gr.Blocks(title="BERT Metagenome Embeddings", css=CUSTOM_CSS) as demo:
-    gr.Markdown("""
-# bert-embedding
-Extract embeddings from DNA sequences. BERT model (430M params) pretrained on metagenomic sequences.
-    """)
     with gr.Tab("Extract"):
         with gr.Row():
-            with gr.Column(scale=1):
                 seq_input = gr.Textbox(
                     label="sequence",
-                    placeholder="Paste DNA sequence (FASTA or raw)...",
-                    lines=6,
-                    value=EXAMPLE_SEQUENCE,
-                    info="min 1000 bp"
                 )
                 with gr.Row():
                     mode_input = gr.Radio(
                         choices=["mean", "max", "per-window"],
-                        value="mean",
-                        label="pooling"
-                    )
-                    layer_input = gr.Slider(
-                        minimum=0, maximum=23, value=21, step=1,
-                        label="layer",
-                        info="transformer block (0-23)"
-                    )
-                with gr.Row():
-                    stride_input = gr.Slider(
-                        minimum=50, maximum=500, value=100, step=50,
-                        label="stride"
                     )
                 with gr.Row():
-                    show_heatmap = gr.Checkbox(label="heatmap", value=True)
-                    show_trajectory = gr.Checkbox(label="trajectory", value=True)
                 btn = gr.Button("extract", variant="primary")
                 output = gr.Markdown()
-                download = gr.File(label="download")
-            with gr.Column(scale=2):
                 heatmap_plot = gr.Plot(label="embedding heatmap")
                 trajectory_plot = gr.Plot(label="window trajectory")
                 dims_plot = gr.Plot(label="top varying dimensions")
     btn.click(
         process,
-        inputs=[seq_input, mode_input, stride_input, layer_input, show_heatmap, show_trajectory],
-        outputs=[output, download, heatmap_plot, trajectory_plot, dims_plot],
         api_name="embed"
     )
@@ -374,33 +372,25 @@ from gradio_client import Client
 import numpy as np
 client = Client("genomenet/bert-embedding")
 result = client.predict(
-    sequence="ATGCGATCGATCG...",  # min 1000 bp
-    mode="mean",                   # "mean", "max", or "per-window"
     stride=100,
-    layer=21,                      # transformer layer 0-23
-    show_heatmap=True,
-    show_trajectory=True,
     api_name="/embed"
 )
 summary, emb_path, *plots = result
 embedding = np.load(emb_path)
 ```
-**Layers**: 0-23 (24 transformer blocks). Layer 21 is commonly used for embeddings.
-**Modes**:
-- `mean`: Single 768-dim vector (mean pooled)
-- `max`: Single 768-dim vector (max pooled)
-- `per-window`: Matrix `(n_windows, 768)`
-**Local**:
-```bash
-git clone https://huggingface.co/spaces/genomenet/bert-embedding
-pip install -r requirements.txt && python app.py
-```
         """)
     with gr.Tab("About"):
@@ -411,20 +401,24 @@ pip install -r requirements.txt && python app.py
 |---|---|
 | architecture | BERT, 24 layers, 768 hidden, 12 heads |
 | parameters | ~430M |
-| input | 1000 bp DNA (sliding window) |
-| output | 768-dim embedding per position |
 | pretraining | metagenomic contigs + microbial genomes |
-### Visualization
-- **Heatmap**: 768 dimensions as colored grid. Blue=negative, Red=positive activation.
-- **Trajectory**: How embeddings change across sliding windows. Useful for seeing sequence structure.
-- **Top dimensions**: Dimensions with highest variance - most informative for distinguishing sequence regions.
-### Links
 - Model: [genomenet/bert-metagenome](https://huggingface.co/genomenet/bert-metagenome)
-- CRISPR Detection: [genomenet/crispr-array-detection](https://huggingface.co/spaces/genomenet/crispr-array-detection)
         """)
 if __name__ == "__main__":

 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 import plotly.graph_objects as go
 from custom_layers import get_custom_objects
 MODEL_REPO = "genomenet/bert-metagenome"
 MODEL_FILE = "bert_1k_3.h5"
 WINDOW_SIZE = 1000
+NUM_LAYERS = 24
 EMBEDDING_DIM = 768
 # Singleton model cache
 _model = None
+_embedding_models = {}
 def get_base_model():
     """Load and cache the base model."""
         print(f"Loading model from {model_path}...")
         _model = tf.keras.models.load_model(model_path, custom_objects=get_custom_objects(), compile=False)
         print("Model loaded.")
+        # Print model summary for debugging
+        print(f"Model outputs: {_model.output_names}")
     return _model
 def get_embedding_model(layer_idx=21):
                 outputs=model.get_layer(layer_name).output
             )
         except ValueError:
             _embedding_models[layer_idx] = tf.keras.Model(
                 inputs=model.input,
                 outputs=model.get_layer("layer_transformer_block_21").output
 def get_gpu_status():
     """Return a short label for the first GPU TensorFlow lists, or "CPU only"."""
     devices = tf.config.list_physical_devices('GPU')
     if not devices:
         return "CPU only"
     return f"GPU: {devices[0].name}"
# Tokenization
TOKEN_MAP = {'A': 1, 'C': 2, 'G': 3, 'T': 4, 'N': 5}


def tokenize(sequence):
    """Map a nucleotide string to an int32 array of token IDs.

    The input is upper-cased and 'U' is rewritten to 'T' before lookup.
    Any character that is not a key of TOKEN_MAP is encoded as 5, the
    same ID as 'N'.
    """
    normalized = sequence.upper().replace('U', 'T')
    token_ids = list(map(lambda base: TOKEN_MAP.get(base, 5), normalized))
    return np.array(token_ids, dtype=np.int32)
 def validate_sequence(sequence):
     if not sequence or len(sequence.strip()) == 0:
 def strip_fasta_header(text):
     """Return the sequence characters of raw or FASTA-formatted text.

     Lines beginning with '>' are dropped as FASTA headers; the remaining
     lines are concatenated with every whitespace character removed.

     Args:
         text: Pasted input, with or without FASTA header lines.

     Returns:
         The sequence as a single string containing no whitespace.
     """
     # splitlines() recognises \r\n and bare \r as well as \n, so input
     # pasted with Windows line endings does not keep stray carriage returns.
     body = [line for line in text.strip().splitlines() if not line.startswith('>')]
     # str.split() with no argument splits on any run of whitespace, so
     # re-joining the pieces removes spaces, tabs and other whitespace alike.
     return ''.join(''.join(body).split())
def compute_embedding_stats(embedding):
    """Summarise the distribution of activations in an embedding.

    Args:
        embedding: Array-like of activations. All elements are pooled into
            one distribution regardless of the input's shape.

    Returns:
        Dict of Python floats with keys:
            l2_norm:  Euclidean norm of the activations.
            mean:     Mean activation.
            std:      Standard deviation of the activations.
            sparsity: Fraction of activations with absolute value < 0.1.
            entropy:  Shannon entropy (in nats) of a 50-bin histogram of
                      the activations; lies between 0 and log(50).
            kurtosis: Excess kurtosis (fourth standardised moment minus 3).
    """
    emb = np.asarray(embedding)

    l2_norm = np.linalg.norm(emb)
    mean_act = np.mean(emb)
    std_act = np.std(emb)
    sparsity = np.mean(np.abs(emb) < 0.1)

    # Discretise into 50 equal-width bins and convert the counts into
    # probabilities. Density values (count / width) must not be used here:
    # they do not sum to 1, so -sum(d * log d) would depend on the value
    # range and could drop far below zero.
    counts, _ = np.histogram(emb, bins=50)
    probs = counts[counts > 0] / max(counts.sum(), 1)
    # max() also normalises the -0.0 produced by a single occupied bin.
    entropy = max(0.0, float(-np.sum(probs * np.log(probs))))

    # The epsilon keeps a constant embedding (std == 0) from dividing by zero.
    kurtosis = np.mean(((emb - mean_act) / (std_act + 1e-10)) ** 4) - 3

    return {
        'l2_norm': float(l2_norm),
        'mean': float(mean_act),
        'std': float(std_act),
        'sparsity': float(sparsity),
        'entropy': float(entropy),
        'kurtosis': float(kurtosis),
    }
 def embed_sequence(sequence, mode="mean", stride=100, layer=21):
     """Extract embeddings from sequence."""
     model = get_embedding_model(layer)
     seq_len = len(sequence)
     embeddings = []
     positions = []
     for start in range(0, seq_len - WINDOW_SIZE + 1, stride):
         window = sequence[start:start + WINDOW_SIZE]
+        tokens = np.expand_dims(tokenize(window), axis=0)
         emb = model.predict(tokens, verbose=0)
         embeddings.append(emb[0])
         positions.append(start)
     embeddings = np.array(embeddings)  # (n_windows, 1000, 768)
     if mode == "mean":
         window_emb = np.mean(embeddings, axis=1)
         return np.mean(window_emb, axis=0), window_emb, positions
     cols = 32
     rows = int(np.ceil(n_dims / cols))
     padded = np.full(rows * cols, np.nan)
     padded[:n_dims] = embedding
     grid = padded.reshape(rows, cols)
     finite = embedding[np.isfinite(embedding)]
+    vmax = max(abs(np.nanmin(finite)), abs(np.nanmax(finite)), 0.01) if finite.size > 0 else 1.0
+    fig, ax = plt.subplots(figsize=(14, max(4, rows * 0.35)))
     im = ax.imshow(grid, cmap='RdBu_r', vmin=-vmax, vmax=vmax, aspect='auto')
+    plt.colorbar(im, ax=ax, shrink=0.8, label='Activation')
+    ax.set_xlabel('Dimension')
+    ax.set_ylabel('Row')
+    ax.set_title(f'{title} ({n_dims} dims)')
     ax.set_xticks(np.arange(0, cols, 8))
     plt.tight_layout()
     return fig
def create_trajectory_plot(window_embeddings, positions):
    """Build an interactive heatmap of the per-window embeddings.

    Args:
        window_embeddings: 2-D array-like with one row per window and one
            column per embedding dimension.
        positions: Start offset of each window; used as the row labels.

    Returns:
        A plotly Figure holding a single Heatmap trace.
    """
    emb = np.array(window_embeddings)
    n_windows, n_dims = emb.shape

    # Keep only every step-th column so wide embeddings stay readable.
    step = max(1, n_dims // 100)
    emb_sub = emb[:, ::step]
    # Label each kept column with its index in the full embedding, so the
    # hover text reports the real dimension rather than the column number
    # within the subsampled matrix.
    dim_labels = list(range(0, n_dims, step))

    # Symmetric limits keep zero at the centre of the diverging colour scale.
    vmax = max(abs(np.nanmin(emb_sub)), abs(np.nanmax(emb_sub)), 0.01)

    fig = go.Figure(go.Heatmap(
        z=emb_sub,
        x=dim_labels,
        y=[f"{p}" for p in positions],
        colorscale='RdBu_r',
        zmin=-vmax, zmax=vmax,
        colorbar=dict(title='Act.'),
        hovertemplate='Pos: %{y} bp<br>Dim: %{x}<br>Val: %{z:.3f}<extra></extra>'
    ))
    fig.update_layout(
        xaxis=dict(title='Dimension' + (' (subsampled)' if step > 1 else '')),
        yaxis=dict(title='Window start (bp)'),
        # Grow the figure with the number of windows, with a minimum height.
        height=max(350, n_windows * 15 + 100),
        margin=dict(l=60, r=20, t=30, b=50)
    )
    return fig
def create_stats_plot(stats):
    """Render the embedding statistics as a bar chart, one bar per metric.

    Values are drawn unscaled on a single shared y-axis, and each bar is
    labelled with its value to three decimals.

    Args:
        stats: Mapping with the keys 'l2_norm', 'mean', 'std', 'sparsity',
            'entropy' and 'kurtosis'.

    Returns:
        A plotly Figure with one Bar trace per metric and no legend.
    """
    # (display label, key in ``stats``, bar colour), in plotting order.
    bars = (
        ('L2 Norm', 'l2_norm', '#3b82f6'),
        ('Mean', 'mean', '#10b981'),
        ('Std', 'std', '#f59e0b'),
        ('Sparsity', 'sparsity', '#ef4444'),
        ('Entropy', 'entropy', '#8b5cf6'),
        ('Kurtosis', 'kurtosis', '#ec4899'),
    )

    fig = go.Figure()
    for label, key, color in bars:
        value = stats[key]
        fig.add_trace(go.Bar(
            x=[label],
            y=[value],
            name=label,
            marker_color=color,
            text=[f'{value:.3f}'],
            textposition='outside',
        ))

    fig.update_layout(
        showlegend=False,
        height=280,
        margin=dict(l=40, r=20, t=30, b=40),
        yaxis=dict(title='Value'),
    )
    return fig
def create_dimension_plot(window_embeddings, positions, top_k=8):
    """Plot the embedding dimensions that vary most across windows.

    Args:
        window_embeddings: 2-D array-like with one row per window and one
            column per embedding dimension.
        positions: X coordinate for each window.
        top_k: Number of highest-variance dimensions to draw. Zero or a
            negative value draws no traces.

    Returns:
        A plotly Figure with one line trace per selected dimension, ordered
        from highest to lowest variance.
    """
    emb = np.array(window_embeddings)
    variances = np.var(emb, axis=0)
    # Reverse first and then take a prefix: a trailing slice [-top_k:]
    # would select every dimension when top_k is 0.
    top_dims = np.argsort(variances)[::-1][:max(top_k, 0)]

    colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3',
              '#ff7f00', '#a65628', '#f781bf', '#999999']

    fig = go.Figure()
    for i, dim in enumerate(top_dims):
        fig.add_trace(go.Scatter(
            x=positions, y=emb[:, dim],
            mode='lines', name=f'd{dim}',
            # Cycle through the palette when top_k exceeds its length.
            line=dict(color=colors[i % len(colors)], width=1.5)
        ))
    fig.update_layout(
        xaxis=dict(title='Position (bp)'),
        yaxis=dict(title='Activation'),
        height=300,
        legend=dict(orientation='h', y=1.1),
        margin=dict(l=50, r=20, t=40, b=50)
    )
    return fig
+# Example sequence
 EXAMPLE_SEQUENCE = """ATGCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTACGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCG"""
+def process(sequence: str, mode: str, stride: int, layer: int):
     """Main processing function."""
     sequence = strip_fasta_header(sequence.strip())
     is_valid, error = validate_sequence(sequence)
     if not is_valid:
+        return f"**Error**: {error}", None, None, None, None, None
     embedding, window_embeddings, positions = embed_sequence(
         sequence, mode=mode, stride=stride, layer=layer
     path = os.path.join(tempfile.gettempdir(), "embedding.npy")
     np.save(path, embedding)
+    # Compute stats
+    if mode == "per-window":
+        # For per-window, compute stats on mean embedding
+        mean_emb = np.mean(embedding, axis=0)
+        stats = compute_embedding_stats(mean_emb)
+    else:
+        stats = compute_embedding_stats(embedding)
     # Create summary
     if mode == "per-window":
+        summary = f"""### Results
 | | |
 |---|---|
 | sequence | {len(sequence):,} bp |
 | layer | {layer} |
 | windows | {embedding.shape[0]} |
+| shape | {embedding.shape} |
+**Stats** (on mean): L2={stats['l2_norm']:.1f}, entropy={stats['entropy']:.2f}
 """
     else:
+        summary = f"""### Results
 | | |
 |---|---|
 | mode | {mode} |
 | dim | {len(embedding)} |
+**Stats**: L2={stats['l2_norm']:.1f}, entropy={stats['entropy']:.2f}, sparsity={stats['sparsity']:.1%}
 """
     # Create visualizations
     heatmap_fig = None
+    if mode != "per-window":
+        heatmap_fig = create_embedding_heatmap(embedding, f"Layer {layer}")
+    trajectory_fig = create_trajectory_plot(window_embeddings, positions) if len(window_embeddings) > 1 else None
+    stats_fig = create_stats_plot(stats)
+    dims_fig = create_dimension_plot(window_embeddings, positions) if len(window_embeddings) > 1 else None
+    return summary, path, heatmap_fig, trajectory_fig, stats_fig, dims_fig
 # Build interface
+with gr.Blocks(
+    title="BERT Metagenome Embeddings",
+    css=".gradio-container { max-width: 100% !important; }"
+) as demo:
+    gr.Markdown("# bert-embedding\nExtract embeddings from DNA sequences. BERT (430M params) pretrained on metagenomes.")
     with gr.Tab("Extract"):
         with gr.Row():
+            with gr.Column(scale=1, min_width=300):
                 seq_input = gr.Textbox(
                     label="sequence",
+                    placeholder="Paste DNA (FASTA or raw)...",
+                    lines=5,
+                    value=EXAMPLE_SEQUENCE
                 )
                 with gr.Row():
                     mode_input = gr.Radio(
                         choices=["mean", "max", "per-window"],
+                        value="mean", label="pooling"
                     )
                 with gr.Row():
+                    layer_input = gr.Slider(0, 23, value=21, step=1, label="layer")
+                    stride_input = gr.Slider(50, 500, value=100, step=50, label="stride")
                 btn = gr.Button("extract", variant="primary")
                 output = gr.Markdown()
+                download = gr.File(label="download .npy")
+            with gr.Column(scale=1, min_width=300):
+                stats_plot = gr.Plot(label="embedding statistics")
                 heatmap_plot = gr.Plot(label="embedding heatmap")
+            with gr.Column(scale=1, min_width=300):
                 trajectory_plot = gr.Plot(label="window trajectory")
                 dims_plot = gr.Plot(label="top varying dimensions")
     btn.click(
         process,
+        inputs=[seq_input, mode_input, stride_input, layer_input],
+        outputs=[output, download, heatmap_plot, trajectory_plot, stats_plot, dims_plot],
         api_name="embed"
     )
 import numpy as np
 client = Client("genomenet/bert-embedding")
 result = client.predict(
+    sequence="ATGC...",    # min 1000 bp
+    mode="mean",           # mean/max/per-window
     stride=100,
+    layer=21,              # 0-23
     api_name="/embed"
 )
 summary, emb_path, *plots = result
 embedding = np.load(emb_path)
 ```
+**Statistics**:
+- **L2 Norm**: Magnitude of embedding. Higher = stronger model response.
+- **Entropy**: Activation distribution spread. Lower = more structured/confident.
+- **Sparsity**: Fraction of near-zero dims. Higher = sparser representation.
+- **Kurtosis**: Peakedness. Higher = more concentrated activations.
+These can serve as proxy "familiarity" scores - sequences similar to training data
+tend to produce more structured embeddings (lower entropy, higher kurtosis).
         """)
     with gr.Tab("About"):
 |---|---|
 | architecture | BERT, 24 layers, 768 hidden, 12 heads |
 | parameters | ~430M |
+| input | 1000 bp sliding window |
 | pretraining | metagenomic contigs + microbial genomes |
+### Interpreting Statistics
+The embedding statistics provide indirect measures of how the model "responds" to a sequence:
+- **L2 Norm**: Total activation magnitude. Very high or low may indicate unusual sequences.
+- **Entropy**: How spread out the activations are. Lower entropy suggests more confident/structured representation.
+- **Sparsity**: Fraction of dimensions with near-zero activation.
+- **Kurtosis**: How peaked the distribution is. Higher values = more concentrated activations.
+**Note**: These are not direct "familiarity" probabilities, but patterns in these metrics across
+different sequence types may reveal what the model considers typical vs. unusual.
+### Links
 - Model: [genomenet/bert-metagenome](https://huggingface.co/genomenet/bert-metagenome)
+- CRISPR: [genomenet/crispr-array-detection](https://huggingface.co/spaces/genomenet/crispr-array-detection)
         """)
 if __name__ == "__main__":