Spaces:
Sleeping
Rework embedding UI: per-position plot, wider output column, E. coli example
Browse files
Three fixes:
- Default example was 1035 bp -> only 1 window, so the trajectory and
top-varying-dimension plots silently rendered as None. Replaced it with a
3 kb slice of E. coli K-12 MG1655 around lacZ (NC_000913.3:365529-368600)
-> 21 windows at the default stride, with real biological structure instead
of synthetic repeats.
- Layout was three equal columns with inputs + results squeezed into the left.
Now: narrow input column on the left, wide output column on the right
containing summary, download, and all plots.
- Replaced the bar chart of global embedding stats (L2/entropy/sparsity/kurtosis
on a single pooled vector) with a per-window plot along the sequence:
L2 norm (response strength) and novelty (1 - cos similarity to sequence mean).
This actually answers "which parts of the sequence does the model respond to
differently?" — spikes/dips correspond to position ranges you can read off
the x-axis. Numeric pooled stats stay in the markdown summary.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
|
@@ -202,32 +202,45 @@ def create_trajectory_plot(window_embeddings, positions):
|
|
| 202 |
)
|
| 203 |
return fig
|
| 204 |
|
| 205 |
-
def
|
| 206 |
-
"""
|
| 207 |
-
names = ['L2 Norm', 'Mean', 'Std', 'Sparsity', 'Entropy', 'Kurtosis']
|
| 208 |
-
values = [stats['l2_norm'], stats['mean'], stats['std'],
|
| 209 |
-
stats['sparsity'], stats['entropy'], stats['kurtosis']]
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
|
|
|
|
|
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
text=[f'{val:.3f}'],
|
| 222 |
-
textposition='outside'
|
| 223 |
-
))
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
fig.update_layout(
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
margin=dict(l=40, r=20, t=30, b=40),
|
| 229 |
-
yaxis=dict(title='Value')
|
| 230 |
)
|
|
|
|
|
|
|
| 231 |
return fig
|
| 232 |
|
| 233 |
def create_dimension_plot(window_embeddings, positions, top_k=8):
|
|
@@ -256,8 +269,55 @@ def create_dimension_plot(window_embeddings, positions, top_k=8):
|
|
| 256 |
)
|
| 257 |
return fig
|
| 258 |
|
| 259 |
-
# Example sequence
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
def process(sequence: str, mode: str, stride: int, layer: int):
|
| 263 |
"""Main processing function."""
|
|
@@ -314,11 +374,12 @@ def process(sequence: str, mode: str, stride: int, layer: int):
|
|
| 314 |
if mode != "per-window":
|
| 315 |
heatmap_fig = create_embedding_heatmap(embedding, f"Layer {layer}")
|
| 316 |
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
|
|
|
| 320 |
|
| 321 |
-
return summary, path, heatmap_fig, trajectory_fig,
|
| 322 |
|
| 323 |
# Build interface
|
| 324 |
with gr.Blocks(
|
|
@@ -329,37 +390,35 @@ with gr.Blocks(
|
|
| 329 |
|
| 330 |
with gr.Tab("Extract"):
|
| 331 |
with gr.Row():
|
| 332 |
-
with gr.Column(scale=1, min_width=
|
| 333 |
seq_input = gr.Textbox(
|
| 334 |
label="sequence",
|
| 335 |
placeholder="Paste DNA (FASTA or raw)...",
|
| 336 |
-
lines=
|
| 337 |
value=EXAMPLE_SEQUENCE
|
| 338 |
)
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
stride_input = gr.Slider(50, 500, value=100, step=50, label="stride")
|
| 347 |
btn = gr.Button("extract", variant="primary")
|
|
|
|
|
|
|
| 348 |
output = gr.Markdown()
|
| 349 |
download = gr.File(label="download .npy")
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
with gr.Column(scale=1, min_width=300):
|
| 356 |
-
trajectory_plot = gr.Plot(label="window trajectory")
|
| 357 |
-
dims_plot = gr.Plot(label="top varying dimensions")
|
| 358 |
|
| 359 |
btn.click(
|
| 360 |
process,
|
| 361 |
inputs=[seq_input, mode_input, stride_input, layer_input],
|
| 362 |
-
outputs=[output, download, heatmap_plot, trajectory_plot,
|
| 363 |
api_name="embed"
|
| 364 |
)
|
| 365 |
|
|
@@ -383,14 +442,12 @@ summary, emb_path, *plots = result
|
|
| 383 |
embedding = np.load(emb_path)
|
| 384 |
```
|
| 385 |
|
| 386 |
-
**
|
| 387 |
-
- **L2
|
| 388 |
-
- **
|
| 389 |
-
|
| 390 |
-
- **Kurtosis**: Peakedness. Higher = more concentrated activations.
|
| 391 |
|
| 392 |
-
|
| 393 |
-
tend to produce more structured embeddings (lower entropy, higher kurtosis).
|
| 394 |
""")
|
| 395 |
|
| 396 |
with gr.Tab("About"):
|
|
|
|
| 202 |
)
|
| 203 |
return fig
|
| 204 |
|
| 205 |
+
def create_familiarity_plot(window_embeddings, positions):
    """Plot per-window response strength and novelty against sequence position.

    Builds a two-row figure with a shared x-axis:

    * top row: the L2 norm of every window embedding;
    * bottom row: novelty, i.e. 1 - cosine similarity between a window's
      embedding and the mean embedding taken over all windows.

    Args:
        window_embeddings: Per-window embedding vectors. They are converted
            with ``np.array`` and reduced along axis 1, so a 2-D
            (windows x dimensions) layout is expected.
        positions: X-axis values for the traces (the axis is labelled as the
            window start in bp); presumably one entry per window.

    Returns:
        The Plotly figure containing both traces.
    """
    from plotly.subplots import make_subplots

    vectors = np.array(window_embeddings)
    window_count = vectors.shape[0]

    norms = np.linalg.norm(vectors, axis=1)
    centroid = vectors.mean(axis=0)
    centroid_norm = np.linalg.norm(centroid) + 1e-10
    # The epsilon is applied to the mean norm and again to the product, which
    # keeps the division finite even for all-zero vectors.
    similarity = (vectors @ centroid) / (norms * centroid_norm + 1e-10)
    novelty = 1.0 - similarity

    fig = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.12,
        subplot_titles=(
            'L2 norm (response strength)',
            'Novelty (1 − cosine similarity to mean)',
        ),
    )

    # (row, y-values, line colour, hover template) for each stacked panel.
    panels = (
        (1, norms, '#3b82f6', 'pos %{x} bp<br>L2=%{y:.2f}<extra></extra>'),
        (2, novelty, '#ef4444', 'pos %{x} bp<br>novelty=%{y:.3f}<extra></extra>'),
    )
    for row, series, colour, hover in panels:
        fig.add_trace(
            go.Scatter(
                x=positions,
                y=series,
                mode='lines+markers',
                line=dict(color=colour, width=2),
                marker=dict(size=5),
                hovertemplate=hover,
                showlegend=False,
            ),
            row=row,
            col=1,
        )

    fig.update_xaxes(title_text='window start (bp)', row=2, col=1)
    fig.update_yaxes(title_text='L2', row=1, col=1)
    fig.update_yaxes(title_text='1 − cos', row=2, col=1)

    if window_count > 1:
        figure_height = 360
    else:
        figure_height = 260
    fig.update_layout(
        height=figure_height,
        margin=dict(l=50, r=20, t=40, b=40),
    )

    # Subplot titles are stored as layout annotations; shrink their font.
    for title_annotation in fig['layout']['annotations']:
        title_annotation['font'] = dict(size=11)
    return fig
|
| 245 |
|
| 246 |
def create_dimension_plot(window_embeddings, positions, top_k=8):
|
|
|
|
| 269 |
)
|
| 270 |
return fig
|
| 271 |
|
| 272 |
+
# Example sequence: ~3 kb slice of E. coli K-12 MG1655 around the lacZ operon
|
| 273 |
+
# (NC_000913.3, positions 365529-368600). Covers the lac repressor binding region,
|
| 274 |
+
# the lacZ gene, and flanking regulatory sequence, so per-window plots show
|
| 275 |
+
# real biological structure transitions.
|
| 276 |
+
EXAMPLE_SEQUENCE = (
|
| 277 |
+
"AACTGTTACCCGTAGGTAGTCACGCAACTCGCCGCACATCTGAACTTCAGCCTCCAGTACAGCGCGGCTGAA"
|
| 278 |
+
"ATCATCATTAAAGCGAGTGGCAACATGGAAATCGCTGATTTGTGTAGTCGGTTTATGCAGCAACGAGACGTC"
|
| 279 |
+
"ACGGAAAATGCCGCTCATCCGCCACATATCCTGATCTTCCAGATAACTGCCGTCACTCCAGCGCAGCACCAT"
|
| 280 |
+
"CACCGCGAGGCGGTTTTCTCCGGCGCGTAAAAATGCGCTCAGGTCAAATTCAGACGGCAAACGACTGTCCTG"
|
| 281 |
+
"GCCGTAACCGACCCAGCGCCCGTTGCACCACAGATGAAACGCCGAGTTAACGCCATCAAAAATAATTCGCGT"
|
| 282 |
+
"CTGGCCTTCCTGTAGCCAGCTTTCATCAACATTAAATGTGAGCGAGTAACAACCCGTCGGATTCTCCGTGGG"
|
| 283 |
+
"AACAAACGGCGGATTGACCGTAATGGGATAGGTCACGTTGGTGTAGATGGGCGCATCGTAACCGTGCATCTG"
|
| 284 |
+
"CCAGTTTGAGGGGACGACGACAGTATCGGCCTCAGGAAGATCGCACTCCAGCCAGCTTTCCGGCACCGCTTC"
|
| 285 |
+
"TGGTGCCGGAAACCAGGCAAAGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCG"
|
| 286 |
+
"GGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGG"
|
| 287 |
+
"GTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAATCCGTAATCATGGTCATAGCTGTTTCCTGTGT"
|
| 288 |
+
"GAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCT"
|
| 289 |
+
"AATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCC"
|
| 290 |
+
"AGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCCAGGGTGGTTTTTCTT"
|
| 291 |
+
"TTCACCAGTGAGACGGGCAACAGCTGATTGCCCTTCACCGCCTGGCCCTGAGAGAGTTGCAGCAAGCGGTCC"
|
| 292 |
+
"ACGCTGGTTTGCCCCAGCAGGCGAAAATCCTGTTTGATGGTGGTTAACGGCGGGATATAACATGAGCTGTCT"
|
| 293 |
+
"TCGGTATCGTCGTATCCCACTACCGAGATATCCGCACCAACGCGCAGCCCGGACTCGGTAATGGCGCGCATT"
|
| 294 |
+
"GCGCCCAGCGCCATCTGATCGTTGGCAACCAGCATCGCAGTGGGAACGATGCCCTCATTCAGCATTTGCATG"
|
| 295 |
+
"GTTTGTTGAAAACCGGACATGGCACTCCAGTCGCCTTCCCGTTCCGCTATCGGCTGAATTTGATTGCGAGTG"
|
| 296 |
+
"AGATATTTATGCCAGCCAGCCAGACGCAGACGCGCCGAGACAGAACTTAATGGGCCCGCTAACAGCGCGATT"
|
| 297 |
+
"TGCTGGTGACCCAATGCGACCAGATGCTCCACGCCCAGTCGCGTACCGTCTTCATGGGAGAAAATAATACTG"
|
| 298 |
+
"TTGATGGGTGTCTGGTCAGAGACATCAAGAAATAACGCCGGAACATTAGTGCAGGCAGCTTCCACAGCAATG"
|
| 299 |
+
"GCATCCTGGTCATCCAGCGGATAGTTAATGATCAGCCCACTGACGCGTTGCGCGAGAAGATTGTGCACCGCC"
|
| 300 |
+
"GCTTTACAGGCTTCGACGCCGCTTCGTTCTACCATCGACACCACCACGCTGGCACCCAGTTGATCGGCGCGA"
|
| 301 |
+
"GATTTAATCGCCGCGACAATTTGCGACGGCGCGTGCAGGGCCAGACTGGAGGTGGCAACGCCAATCAGCAAC"
|
| 302 |
+
"GACTGTTTGCCCGCCAGTTGTTGTGCCACGCGGTTGGGAATGTAATTCAGCTCCGCCATCGCCGCTTCCACT"
|
| 303 |
+
"TTTTCCCGCGTTTTCGCAGAAACGTGGCTGGCCTGGTTCACCACGCGGGAAACGGTCTGATAAGAGACACCG"
|
| 304 |
+
"GCATACTCTGCGACATCGTATAACGTTACTGGTTTCACATTCACCACCCTGAATTGACTCTCTTCCGGGCGC"
|
| 305 |
+
"TATCATGCCATACCGCGAAAGGTTTTGCGCCATTCGATGGTGTCAACGTAAATGCATGCCGCTTCGCCTTCC"
|
| 306 |
+
"GGCCACCAGAATAGCCTGCGATTCAACCCCTTCTTCGATCTGTTTTGCTACCCGTTGTAGCGCCGGAAGATG"
|
| 307 |
+
"CTTTTCCGCTGCCTGTTCAATGGTCATTGCGCTCGCCATATACACCAGATTCAGACAGCCAATCACCCGTTG"
|
| 308 |
+
"TTCACTGCGCAGCGGTACGGCGATAGAGGCGATCTTCTCCTCCTGATCCCAGCCGCGGTAGTTCTGTCCGTA"
|
| 309 |
+
"ACCCTCTTTGCGCGCGCGCGCCAGAATGGCTTCCAGCTTTAACGGTTCCCGTGCCAGTTGATAGTCATCACC"
|
| 310 |
+
"GGGGCGGGAGGCTAACATTTCGATTAATTCCTTGCGGTCTTGTTCCGGGCAAAAGGCCAGCCAGGTCAGGCC"
|
| 311 |
+
"CGAGGCGGTTTTCAGAAGCGGCAAACGTCGCCCGACCATTGCCCGGTGAAAGGATAAGCGGCTGAAACGGTG"
|
| 312 |
+
"AGTGGTTTCGCGTACCACCATTGCATCAACATCCAGCGTGGACACATCTGTCGGCCATACCACTTCGCGCAA"
|
| 313 |
+
"CAGATCGCCCAGCAGTGGGGCCGCCAGTGCAGAAATCCACTGTTCGTCACGAAATCCTTCGCTTAATTGCCG"
|
| 314 |
+
"CACTTTGATGGTCAGTCGAAAACTATCATCGGAGGGGCTACGGCGGACATATCCCTCTTCCTGCAGCGTCTC"
|
| 315 |
+
"CAGCAGTCGCCGCACAGTGGTGCGATGCAGGCCGCTGAGTTCCGCCAGCAGCCCGACGCTGGCACCGCCATC"
|
| 316 |
+
"AAGTTTATTTAACATATTTAATAACATTAGACCGCGGGTTAAGCCGCGCACGGTTTTGTATTCCGTCTGCTC"
|
| 317 |
+
"ATTGTTCTGCATATTAATTGACATTTCTATAGTTAAAACAACGTGGTGCACCTGGTGCACATTCGGGCATGT"
|
| 318 |
+
"TTTGATTGTAGCCGAAAACACCCTTCCTATACTGAGCGCACAATAAAAAATCATTTACATGTTTTTAACAAA"
|
| 319 |
+
"ATAAGTTGCGCTGTACTGTGCGCGCAACGACATTTTGTCCGAGTCGTG"
|
| 320 |
+
)
|
| 321 |
|
| 322 |
def process(sequence: str, mode: str, stride: int, layer: int):
|
| 323 |
"""Main processing function."""
|
|
|
|
| 374 |
if mode != "per-window":
|
| 375 |
heatmap_fig = create_embedding_heatmap(embedding, f"Layer {layer}")
|
| 376 |
|
| 377 |
+
multi_window = len(window_embeddings) > 1
|
| 378 |
+
trajectory_fig = create_trajectory_plot(window_embeddings, positions) if multi_window else None
|
| 379 |
+
familiarity_fig = create_familiarity_plot(window_embeddings, positions) if multi_window else None
|
| 380 |
+
dims_fig = create_dimension_plot(window_embeddings, positions) if multi_window else None
|
| 381 |
|
| 382 |
+
return summary, path, heatmap_fig, trajectory_fig, familiarity_fig, dims_fig
|
| 383 |
|
| 384 |
# Build interface
|
| 385 |
with gr.Blocks(
|
|
|
|
| 390 |
|
| 391 |
with gr.Tab("Extract"):
|
| 392 |
with gr.Row():
|
| 393 |
+
with gr.Column(scale=1, min_width=260):
|
| 394 |
seq_input = gr.Textbox(
|
| 395 |
label="sequence",
|
| 396 |
placeholder="Paste DNA (FASTA or raw)...",
|
| 397 |
+
lines=8,
|
| 398 |
value=EXAMPLE_SEQUENCE
|
| 399 |
)
|
| 400 |
+
mode_input = gr.Radio(
|
| 401 |
+
choices=["mean", "max", "per-window"],
|
| 402 |
+
value="mean", label="pooling"
|
| 403 |
+
)
|
| 404 |
+
layer_input = gr.Slider(0, 23, value=21, step=1, label="layer")
|
| 405 |
+
stride_input = gr.Slider(50, 500, value=100, step=50, label="stride",
|
| 406 |
+
info="lower = finer resolution, more compute")
|
|
|
|
| 407 |
btn = gr.Button("extract", variant="primary")
|
| 408 |
+
|
| 409 |
+
with gr.Column(scale=3, min_width=500):
|
| 410 |
output = gr.Markdown()
|
| 411 |
download = gr.File(label="download .npy")
|
| 412 |
+
familiarity_plot = gr.Plot(label="per-window response & novelty along sequence")
|
| 413 |
+
with gr.Row():
|
| 414 |
+
trajectory_plot = gr.Plot(label="window trajectory")
|
| 415 |
+
dims_plot = gr.Plot(label="top varying dimensions")
|
| 416 |
+
heatmap_plot = gr.Plot(label="pooled embedding heatmap")
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
btn.click(
|
| 419 |
process,
|
| 420 |
inputs=[seq_input, mode_input, stride_input, layer_input],
|
| 421 |
+
outputs=[output, download, heatmap_plot, trajectory_plot, familiarity_plot, dims_plot],
|
| 422 |
api_name="embed"
|
| 423 |
)
|
| 424 |
|
|
|
|
| 442 |
embedding = np.load(emb_path)
|
| 443 |
```
|
| 444 |
|
| 445 |
+
**Per-window plots** (along sequence position):
|
| 446 |
+
- **L2 norm**: activation magnitude — high = strong, structured response.
|
| 447 |
+
- **Novelty** (1 − cosine similarity to mean embedding): how much the window differs
|
| 448 |
+
from the rest of the sequence. Spikes = unusual regions relative to context.
|
|
|
|
| 449 |
|
| 450 |
+
Numeric stats (L2, entropy, sparsity, kurtosis) are in the summary text.
|
|
|
|
| 451 |
""")
|
| 452 |
|
| 453 |
with gr.Tab("About"):
|