genomenet Claude Opus 4.7 (1M context) committed on
Commit
f48b1be
·
1 Parent(s): 09774a9

Rework embedding UI: per-position plot, wider output column, E. coli example

Browse files

Three fixes:
- Default example was 1035 bp -> only 1 window, so trajectory and top-varying-
dimension plots silently rendered as None. Replaced with a 3 kb slice of
E. coli K-12 MG1655 around lacZ (NC_000913.3:365529-368600) -> 21 windows
at the default stride, and real biological structure instead of synthetic
repeats.
- Layout was three equal columns with inputs + results squeezed into the left.
Now: narrow input column on the left, wide output column on the right
containing summary, download, and all plots.
- Replaced the bar chart of global embedding stats (L2/entropy/sparsity/kurtosis
on a single pooled vector) with a per-window plot along the sequence:
L2 norm (response strength) and novelty (1 - cos similarity to sequence mean).
This actually answers "which parts of the sequence does the model respond to
differently?" — spikes/dips correspond to position ranges you can read off
the x-axis. Numeric pooled stats stay in the markdown summary.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +110 -53
app.py CHANGED
@@ -202,32 +202,45 @@ def create_trajectory_plot(window_embeddings, positions):
202
  )
203
  return fig
204
 
205
- def create_stats_plot(stats):
206
- """Create a bar chart of embedding statistics."""
207
- names = ['L2 Norm', 'Mean', 'Std', 'Sparsity', 'Entropy', 'Kurtosis']
208
- values = [stats['l2_norm'], stats['mean'], stats['std'],
209
- stats['sparsity'], stats['entropy'], stats['kurtosis']]
210
 
211
- # Normalize for display (different scales)
212
- fig = go.Figure()
213
-
214
- colors = ['#3b82f6', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6', '#ec4899']
 
 
215
 
216
- for i, (name, val) in enumerate(zip(names, values)):
217
- fig.add_trace(go.Bar(
218
- x=[name], y=[val],
219
- name=name,
220
- marker_color=colors[i],
221
- text=[f'{val:.3f}'],
222
- textposition='outside'
223
- ))
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  fig.update_layout(
226
- showlegend=False,
227
- height=280,
228
- margin=dict(l=40, r=20, t=30, b=40),
229
- yaxis=dict(title='Value')
230
  )
 
 
231
  return fig
232
 
233
  def create_dimension_plot(window_embeddings, positions, top_k=8):
@@ -256,8 +269,55 @@ def create_dimension_plot(window_embeddings, positions, top_k=8):
256
  )
257
  return fig
258
 
259
- # Example sequence
260
- EXAMPLE_SEQUENCE = """ATGCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTACGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCG"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  def process(sequence: str, mode: str, stride: int, layer: int):
263
  """Main processing function."""
@@ -314,11 +374,12 @@ def process(sequence: str, mode: str, stride: int, layer: int):
314
  if mode != "per-window":
315
  heatmap_fig = create_embedding_heatmap(embedding, f"Layer {layer}")
316
 
317
- trajectory_fig = create_trajectory_plot(window_embeddings, positions) if len(window_embeddings) > 1 else None
318
- stats_fig = create_stats_plot(stats)
319
- dims_fig = create_dimension_plot(window_embeddings, positions) if len(window_embeddings) > 1 else None
 
320
 
321
- return summary, path, heatmap_fig, trajectory_fig, stats_fig, dims_fig
322
 
323
  # Build interface
324
  with gr.Blocks(
@@ -329,37 +390,35 @@ with gr.Blocks(
329
 
330
  with gr.Tab("Extract"):
331
  with gr.Row():
332
- with gr.Column(scale=1, min_width=300):
333
  seq_input = gr.Textbox(
334
  label="sequence",
335
  placeholder="Paste DNA (FASTA or raw)...",
336
- lines=5,
337
  value=EXAMPLE_SEQUENCE
338
  )
339
- with gr.Row():
340
- mode_input = gr.Radio(
341
- choices=["mean", "max", "per-window"],
342
- value="mean", label="pooling"
343
- )
344
- with gr.Row():
345
- layer_input = gr.Slider(0, 23, value=21, step=1, label="layer")
346
- stride_input = gr.Slider(50, 500, value=100, step=50, label="stride")
347
  btn = gr.Button("extract", variant="primary")
 
 
348
  output = gr.Markdown()
349
  download = gr.File(label="download .npy")
350
-
351
- with gr.Column(scale=1, min_width=300):
352
- stats_plot = gr.Plot(label="embedding statistics")
353
- heatmap_plot = gr.Plot(label="embedding heatmap")
354
-
355
- with gr.Column(scale=1, min_width=300):
356
- trajectory_plot = gr.Plot(label="window trajectory")
357
- dims_plot = gr.Plot(label="top varying dimensions")
358
 
359
  btn.click(
360
  process,
361
  inputs=[seq_input, mode_input, stride_input, layer_input],
362
- outputs=[output, download, heatmap_plot, trajectory_plot, stats_plot, dims_plot],
363
  api_name="embed"
364
  )
365
 
@@ -383,14 +442,12 @@ summary, emb_path, *plots = result
383
  embedding = np.load(emb_path)
384
  ```
385
 
386
- **Statistics**:
387
- - **L2 Norm**: Magnitude of embedding. Higher = stronger model response.
388
- - **Entropy**: Activation distribution spread. Lower = more structured/confident.
389
- - **Sparsity**: Fraction of near-zero dims. Higher = sparser representation.
390
- - **Kurtosis**: Peakedness. Higher = more concentrated activations.
391
 
392
- These can serve as proxy "familiarity" scores - sequences similar to training data
393
- tend to produce more structured embeddings (lower entropy, higher kurtosis).
394
  """)
395
 
396
  with gr.Tab("About"):
 
202
  )
203
  return fig
204
 
205
def create_familiarity_plot(window_embeddings, positions):
    """Plot per-window L2 norm and novelty along the sequence.

    Builds a two-row figure with a shared x-axis:

    * row 1: L2 norm of each window embedding ("response strength");
    * row 2: novelty, defined as ``1 - cosine similarity`` between each
      window embedding and the mean of all window embeddings.

    Args:
        window_embeddings: Per-window embedding vectors, convertible to a
            2-D array with one row per window. A single 1-D vector is
            treated as one window.
        positions: x-axis value for each window; the axis is labelled
            "window start (bp)".

    Returns:
        The Plotly figure containing both traces.
    """
    from plotly.subplots import make_subplots

    # Work in float64 so the cosine does not drift outside [-1, 1] as easily
    # with low-precision embeddings; atleast_2d lets a lone vector through.
    emb = np.atleast_2d(np.asarray(window_embeddings, dtype=np.float64))
    n_windows = emb.shape[0]

    l2 = np.linalg.norm(emb, axis=1)
    mean_vec = emb.mean(axis=0)

    # Safe division: rows where either norm is zero get cosine 0 (novelty 1)
    # instead of a NaN or a divide-by-zero warning.
    denom = l2 * np.linalg.norm(mean_vec)
    cos_sim = np.divide(
        emb @ mean_vec,
        denom,
        out=np.zeros_like(denom),
        where=denom > 0,
    )
    # Round-off can push the cosine a hair above 1, which would plot as a
    # slightly negative novelty; clamp to the mathematically valid range.
    novelty = np.clip(1.0 - cos_sim, 0.0, 2.0)

    fig = make_subplots(
        rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.12,
        subplot_titles=('L2 norm (response strength)', 'Novelty (1 − cosine similarity to mean)')
    )
    fig.add_trace(go.Scatter(
        x=positions, y=l2, mode='lines+markers',
        line=dict(color='#3b82f6', width=2), marker=dict(size=5),
        hovertemplate='pos %{x} bp<br>L2=%{y:.2f}<extra></extra>', showlegend=False
    ), row=1, col=1)
    fig.add_trace(go.Scatter(
        x=positions, y=novelty, mode='lines+markers',
        line=dict(color='#ef4444', width=2), marker=dict(size=5),
        hovertemplate='pos %{x} bp<br>novelty=%{y:.3f}<extra></extra>', showlegend=False
    ), row=2, col=1)
    fig.update_xaxes(title_text='window start (bp)', row=2, col=1)
    fig.update_yaxes(title_text='L2', row=1, col=1)
    fig.update_yaxes(title_text='1 − cos', row=2, col=1)
    fig.update_layout(
        height=360 if n_windows > 1 else 260,
        margin=dict(l=50, r=20, t=40, b=40),
    )
    # The subplot titles are stored as layout annotations; shrink their font.
    for ann in fig['layout']['annotations']:
        ann['font'] = dict(size=11)
    return fig
245
 
246
  def create_dimension_plot(window_embeddings, positions, top_k=8):
 
269
  )
270
  return fig
271
 
272
+ # Example sequence: ~3 kb slice of E. coli K-12 MG1655 around the lac operon
273
+ # (NC_000913.3, positions 365529-368600). Covers the lac repressor binding region,
274
+ # the lacZ gene, and flanking regulatory sequence, so per-window plots show
275
+ # real biological structure transitions.
276
+ EXAMPLE_SEQUENCE = (
277
+ "AACTGTTACCCGTAGGTAGTCACGCAACTCGCCGCACATCTGAACTTCAGCCTCCAGTACAGCGCGGCTGAA"
278
+ "ATCATCATTAAAGCGAGTGGCAACATGGAAATCGCTGATTTGTGTAGTCGGTTTATGCAGCAACGAGACGTC"
279
+ "ACGGAAAATGCCGCTCATCCGCCACATATCCTGATCTTCCAGATAACTGCCGTCACTCCAGCGCAGCACCAT"
280
+ "CACCGCGAGGCGGTTTTCTCCGGCGCGTAAAAATGCGCTCAGGTCAAATTCAGACGGCAAACGACTGTCCTG"
281
+ "GCCGTAACCGACCCAGCGCCCGTTGCACCACAGATGAAACGCCGAGTTAACGCCATCAAAAATAATTCGCGT"
282
+ "CTGGCCTTCCTGTAGCCAGCTTTCATCAACATTAAATGTGAGCGAGTAACAACCCGTCGGATTCTCCGTGGG"
283
+ "AACAAACGGCGGATTGACCGTAATGGGATAGGTCACGTTGGTGTAGATGGGCGCATCGTAACCGTGCATCTG"
284
+ "CCAGTTTGAGGGGACGACGACAGTATCGGCCTCAGGAAGATCGCACTCCAGCCAGCTTTCCGGCACCGCTTC"
285
+ "TGGTGCCGGAAACCAGGCAAAGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCG"
286
+ "GGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGG"
287
+ "GTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAATCCGTAATCATGGTCATAGCTGTTTCCTGTGT"
288
+ "GAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCT"
289
+ "AATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCC"
290
+ "AGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCCAGGGTGGTTTTTCTT"
291
+ "TTCACCAGTGAGACGGGCAACAGCTGATTGCCCTTCACCGCCTGGCCCTGAGAGAGTTGCAGCAAGCGGTCC"
292
+ "ACGCTGGTTTGCCCCAGCAGGCGAAAATCCTGTTTGATGGTGGTTAACGGCGGGATATAACATGAGCTGTCT"
293
+ "TCGGTATCGTCGTATCCCACTACCGAGATATCCGCACCAACGCGCAGCCCGGACTCGGTAATGGCGCGCATT"
294
+ "GCGCCCAGCGCCATCTGATCGTTGGCAACCAGCATCGCAGTGGGAACGATGCCCTCATTCAGCATTTGCATG"
295
+ "GTTTGTTGAAAACCGGACATGGCACTCCAGTCGCCTTCCCGTTCCGCTATCGGCTGAATTTGATTGCGAGTG"
296
+ "AGATATTTATGCCAGCCAGCCAGACGCAGACGCGCCGAGACAGAACTTAATGGGCCCGCTAACAGCGCGATT"
297
+ "TGCTGGTGACCCAATGCGACCAGATGCTCCACGCCCAGTCGCGTACCGTCTTCATGGGAGAAAATAATACTG"
298
+ "TTGATGGGTGTCTGGTCAGAGACATCAAGAAATAACGCCGGAACATTAGTGCAGGCAGCTTCCACAGCAATG"
299
+ "GCATCCTGGTCATCCAGCGGATAGTTAATGATCAGCCCACTGACGCGTTGCGCGAGAAGATTGTGCACCGCC"
300
+ "GCTTTACAGGCTTCGACGCCGCTTCGTTCTACCATCGACACCACCACGCTGGCACCCAGTTGATCGGCGCGA"
301
+ "GATTTAATCGCCGCGACAATTTGCGACGGCGCGTGCAGGGCCAGACTGGAGGTGGCAACGCCAATCAGCAAC"
302
+ "GACTGTTTGCCCGCCAGTTGTTGTGCCACGCGGTTGGGAATGTAATTCAGCTCCGCCATCGCCGCTTCCACT"
303
+ "TTTTCCCGCGTTTTCGCAGAAACGTGGCTGGCCTGGTTCACCACGCGGGAAACGGTCTGATAAGAGACACCG"
304
+ "GCATACTCTGCGACATCGTATAACGTTACTGGTTTCACATTCACCACCCTGAATTGACTCTCTTCCGGGCGC"
305
+ "TATCATGCCATACCGCGAAAGGTTTTGCGCCATTCGATGGTGTCAACGTAAATGCATGCCGCTTCGCCTTCC"
306
+ "GGCCACCAGAATAGCCTGCGATTCAACCCCTTCTTCGATCTGTTTTGCTACCCGTTGTAGCGCCGGAAGATG"
307
+ "CTTTTCCGCTGCCTGTTCAATGGTCATTGCGCTCGCCATATACACCAGATTCAGACAGCCAATCACCCGTTG"
308
+ "TTCACTGCGCAGCGGTACGGCGATAGAGGCGATCTTCTCCTCCTGATCCCAGCCGCGGTAGTTCTGTCCGTA"
309
+ "ACCCTCTTTGCGCGCGCGCGCCAGAATGGCTTCCAGCTTTAACGGTTCCCGTGCCAGTTGATAGTCATCACC"
310
+ "GGGGCGGGAGGCTAACATTTCGATTAATTCCTTGCGGTCTTGTTCCGGGCAAAAGGCCAGCCAGGTCAGGCC"
311
+ "CGAGGCGGTTTTCAGAAGCGGCAAACGTCGCCCGACCATTGCCCGGTGAAAGGATAAGCGGCTGAAACGGTG"
312
+ "AGTGGTTTCGCGTACCACCATTGCATCAACATCCAGCGTGGACACATCTGTCGGCCATACCACTTCGCGCAA"
313
+ "CAGATCGCCCAGCAGTGGGGCCGCCAGTGCAGAAATCCACTGTTCGTCACGAAATCCTTCGCTTAATTGCCG"
314
+ "CACTTTGATGGTCAGTCGAAAACTATCATCGGAGGGGCTACGGCGGACATATCCCTCTTCCTGCAGCGTCTC"
315
+ "CAGCAGTCGCCGCACAGTGGTGCGATGCAGGCCGCTGAGTTCCGCCAGCAGCCCGACGCTGGCACCGCCATC"
316
+ "AAGTTTATTTAACATATTTAATAACATTAGACCGCGGGTTAAGCCGCGCACGGTTTTGTATTCCGTCTGCTC"
317
+ "ATTGTTCTGCATATTAATTGACATTTCTATAGTTAAAACAACGTGGTGCACCTGGTGCACATTCGGGCATGT"
318
+ "TTTGATTGTAGCCGAAAACACCCTTCCTATACTGAGCGCACAATAAAAAATCATTTACATGTTTTTAACAAA"
319
+ "ATAAGTTGCGCTGTACTGTGCGCGCAACGACATTTTGTCCGAGTCGTG"
320
+ )
321
 
322
  def process(sequence: str, mode: str, stride: int, layer: int):
323
  """Main processing function."""
 
374
  if mode != "per-window":
375
  heatmap_fig = create_embedding_heatmap(embedding, f"Layer {layer}")
376
 
377
+ multi_window = len(window_embeddings) > 1
378
+ trajectory_fig = create_trajectory_plot(window_embeddings, positions) if multi_window else None
379
+ familiarity_fig = create_familiarity_plot(window_embeddings, positions) if multi_window else None
380
+ dims_fig = create_dimension_plot(window_embeddings, positions) if multi_window else None
381
 
382
+ return summary, path, heatmap_fig, trajectory_fig, familiarity_fig, dims_fig
383
 
384
  # Build interface
385
  with gr.Blocks(
 
390
 
391
  with gr.Tab("Extract"):
392
  with gr.Row():
393
+ with gr.Column(scale=1, min_width=260):
394
  seq_input = gr.Textbox(
395
  label="sequence",
396
  placeholder="Paste DNA (FASTA or raw)...",
397
+ lines=8,
398
  value=EXAMPLE_SEQUENCE
399
  )
400
+ mode_input = gr.Radio(
401
+ choices=["mean", "max", "per-window"],
402
+ value="mean", label="pooling"
403
+ )
404
+ layer_input = gr.Slider(0, 23, value=21, step=1, label="layer")
405
+ stride_input = gr.Slider(50, 500, value=100, step=50, label="stride",
406
+ info="lower = finer resolution, more compute")
 
407
  btn = gr.Button("extract", variant="primary")
408
+
409
+ with gr.Column(scale=3, min_width=500):
410
  output = gr.Markdown()
411
  download = gr.File(label="download .npy")
412
+ familiarity_plot = gr.Plot(label="per-window response & novelty along sequence")
413
+ with gr.Row():
414
+ trajectory_plot = gr.Plot(label="window trajectory")
415
+ dims_plot = gr.Plot(label="top varying dimensions")
416
+ heatmap_plot = gr.Plot(label="pooled embedding heatmap")
 
 
 
417
 
418
  btn.click(
419
  process,
420
  inputs=[seq_input, mode_input, stride_input, layer_input],
421
+ outputs=[output, download, heatmap_plot, trajectory_plot, familiarity_plot, dims_plot],
422
  api_name="embed"
423
  )
424
 
 
442
  embedding = np.load(emb_path)
443
  ```
444
 
445
+ **Per-window plots** (along sequence position):
446
+ - **L2 norm**: activation magnitude; high = strong, structured response.
447
+ - **Novelty** (1 − cosine similarity to mean embedding): how much the window differs
448
+ from the rest of the sequence. Spikes = unusual regions relative to context.
 
449
 
450
+ Numeric stats (L2, entropy, sparsity, kurtosis) are in the summary text.
 
451
  """)
452
 
453
  with gr.Tab("About"):