Valmbd committed on
Commit
2d07a5e
·
verified ·
1 Parent(s): ee42d0e

[10:35 UTC] AA sequence viewer + rich correlation + fix fillcolor/statsmodels/disp_profile

Browse files
app/pages/1_🔍_Explorer.py CHANGED
@@ -14,7 +14,7 @@ from app.utils.data_loader import (
14
  find_predictions_dir, load_prediction_index, load_modes, load_embeddings,
15
  load_ground_truth, load_pdb_text, PETIMOT_ROOT
16
  )
17
- from app.utils.bio_api import get_protein_mutations
18
  from app.components.embedding_viewer import render_embedding_viewer
19
  from app.components.viewer_3d import render_motion_viewer, render_mode_comparison, render_deformed_viewer, render_animated_viewer, render_pred_vs_gt_viewer
20
  from app.components.sequence_viewer import render_sequence_viewer, render_displacement_chart
@@ -68,11 +68,21 @@ def render_protein_detail(pred_dir, gt_dir, protein_name, key_suffix="", compact
68
  return
69
 
70
  n_res = len(list(modes.values())[0])
71
- seq = gt.get("seq", "X" * n_res) if gt else "X" * n_res
72
  ca = gt["bb"][:, 1] if gt and "bb" in gt else np.zeros((n_res, 3))
73
  coverage = gt.get("coverage", np.ones(n_res)) if gt else np.ones(n_res)
74
  eigenvalues = gt.get("eigvals", None) if gt else None
75
- pdb_text = None
 
 
 
 
 
 
 
 
 
 
76
 
77
  pdb_path = os.path.join(PETIMOT_ROOT, "pdbs", f"{protein_name}.pdb")
78
  if os.path.exists(pdb_path):
@@ -355,7 +365,9 @@ with col_dl:
355
  # ═══════════════════════════════════════
356
  if view_mode == "πŸ” Browse":
357
  cols_to_show = ["name", "seq_len", "n_modes", "mean_disp_m0", "max_disp_m0", "top_residue"]
358
- if "disp_profile" in df_filtered.columns:
 
 
359
  cols_to_show.append("disp_profile")
360
 
361
  selected_idx = st.dataframe(
 
14
  find_predictions_dir, load_prediction_index, load_modes, load_embeddings,
15
  load_ground_truth, load_pdb_text, PETIMOT_ROOT
16
  )
17
+ from app.utils.bio_api import get_protein_mutations, get_sequence_from_pdb, render_sequence_aa
18
  from app.components.embedding_viewer import render_embedding_viewer
19
  from app.components.viewer_3d import render_motion_viewer, render_mode_comparison, render_deformed_viewer, render_animated_viewer, render_pred_vs_gt_viewer
20
  from app.components.sequence_viewer import render_sequence_viewer, render_displacement_chart
 
68
  return
69
 
70
  n_res = len(list(modes.values())[0])
71
+ seq = gt.get("seq", "") if gt else ""
72
  ca = gt["bb"][:, 1] if gt and "bb" in gt else np.zeros((n_res, 3))
73
  coverage = gt.get("coverage", np.ones(n_res)) if gt else np.ones(n_res)
74
  eigenvalues = gt.get("eigvals", None) if gt else None
75
+
76
+ # ── Amino acid sequence strip ────────────────────────────────────
77
+ if not seq or len(seq) < 3:
78
+ with st.spinner("Fetching sequence from RCSB..."):
79
+ seq = get_sequence_from_pdb(protein_name) or "X" * n_res
80
+ mode0_vecs = list(modes.values())[0]
81
+ mode0_mags = np.linalg.norm(mode0_vecs, axis=1) if mode0_vecs.ndim > 1 else np.abs(mode0_vecs)
82
+ mutations = get_protein_mutations(protein_name, n_res)
83
+ render_sequence_aa(seq[:n_res], displacements=mode0_mags, mutations=mutations,
84
+ title=f"πŸ”€ {protein_name} β€” sequence (opacity = mode 0 displacement)")
85
+ st.divider()
86
 
87
  pdb_path = os.path.join(PETIMOT_ROOT, "pdbs", f"{protein_name}.pdb")
88
  if os.path.exists(pdb_path):
 
365
  # ═══════════════════════════════════════
366
  if view_mode == "πŸ” Browse":
367
  cols_to_show = ["name", "seq_len", "n_modes", "mean_disp_m0", "max_disp_m0", "top_residue"]
368
+ has_profiles = ("disp_profile" in df_filtered.columns and
369
+ df_filtered["disp_profile"].apply(lambda x: len(x) if isinstance(x, list) else 0).max() > 0)
370
+ if has_profiles:
371
  cols_to_show.append("disp_profile")
372
 
373
  selected_idx = st.dataframe(
app/pages/3_📊_Statistics.py CHANGED
@@ -185,69 +185,138 @@ fig.update_layout(**PLOT_LAYOUT, height=400, showlegend=False)
185
  st.plotly_chart(fig, use_container_width=True, key="violins")
186
 
187
 
188
- # ═══════════════════════════════════════
189
- # SECTION 2: Correlation Heatmap + Scatter
190
- # ═══════════════════════════════════════
191
- st.markdown('<div class="section-header">πŸ”— Prediction Correlation Analysis <span style="color:#6366f1">[PREDICTIONS]</span></div>', unsafe_allow_html=True)
192
-
193
- col_heat, col_scatter = st.columns([1, 2])
194
-
195
- with col_heat:
196
- _all_corr = {"seq_len": "Seq Length", "mean_disp_m0": "Mean Ξ”",
197
- "max_disp_m0": "Max Ξ”", "top_residue": "Top Residue",
198
- "n_modes": "# Modes"}
199
- corr_cols = [c for c in _all_corr if c in df.columns and df[c].notna().sum() > 5]
200
- labels = [_all_corr[c] for c in corr_cols]
201
- corr_matrix = df[corr_cols].dropna().corr().fillna(0)
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  fig_h = go.Figure(data=go.Heatmap(
204
- z=corr_matrix.values,
205
- x=labels, y=labels,
206
  colorscale=[[0, "#1e1b4b"], [0.5, "#4338ca"], [1, "#ec4899"]],
207
- text=np.round(corr_matrix.values, 2),
208
  texttemplate="%{text}",
209
- textfont=dict(size=13, color="white"),
210
  hovertemplate="<b>%{x}</b> vs <b>%{y}</b><br>r = %{z:.3f}<extra></extra>",
211
  zmin=-1, zmax=1,
212
  colorbar=dict(title="r", tickfont=dict(color="#a5b4fc")),
213
  ))
214
- fig_h.update_layout(**PLOT_LAYOUT, height=380, title="Feature Correlation")
 
215
  st.plotly_chart(fig_h, use_container_width=True, key="heatmap")
216
 
217
- with col_scatter:
218
- fig_s = go.Figure()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
- # 2D density contour
 
 
221
  fig_s.add_trace(go.Histogram2dContour(
222
  x=df.seq_len, y=df.mean_disp_m0,
223
  colorscale=[[0, "rgba(30,27,75,0)"], [0.3, "rgba(99,102,241,0.3)"],
224
  [0.6, "rgba(139,92,246,0.5)"], [1, "rgba(236,72,153,0.7)"]],
225
- ncontours=15,
226
- showscale=False,
227
- hoverinfo="skip",
228
  ))
229
- # Scatter overlay
230
  fig_s.add_trace(go.Scattergl(
231
- x=df.seq_len, y=df.mean_disp_m0,
232
- mode="markers",
233
- marker=dict(
234
- size=3, color=df.max_disp_m0,
235
- colorscale="Viridis", showscale=True,
236
- colorbar=dict(title="Max Ξ” (Γ…)", tickfont=dict(color="#a5b4fc")),
237
- opacity=0.6,
238
- ),
239
  text=df.name,
240
  hovertemplate="<b>%{text}</b><br>Length: %{x}<br>Mean Ξ”: %{y:.3f} Γ…<extra></extra>",
241
  ))
242
- fig_s.update_layout(
243
- **PLOT_LAYOUT, height=380, showlegend=False,
244
- title="Sequence Length vs Mean Displacement",
245
- xaxis_title="Sequence Length",
246
- yaxis_title="Mean Displacement (Γ…)",
247
- )
248
  st.plotly_chart(fig_s, use_container_width=True, key="scatter")
249
 
250
 
 
251
  # ═══════════════════════════════════════
252
  # SECTION 3: Top-10 Leaderboards
253
  # ═══════════════════════════════════════
 
185
  st.plotly_chart(fig, use_container_width=True, key="violins")
186
 
187
 
188
+ # Load the richer merged dataset (protein_stats.csv has 19 structural features + PETIMOT metrics)
189
+ _stats_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data", "protein_stats.csv")
190
+ try:
191
+ import pandas as _pd_st
192
+ df_stats = _pd_st.read_csv(_stats_path)
193
+ _has_stats = len(df_stats) > 100
194
+ except Exception:
195
+ df_stats = None
196
+ _has_stats = False
197
+
198
+ if _has_stats:
199
+ # ── Rich correlation heatmap using 7k merged proteins ──
200
+ _rich_cols = {
201
+ "rmsip_sq": "RMSIPΒ²",
202
+ "nsse_min": "NSSE",
203
+ "ref_len": "Seq Length",
204
+ "rmsd_mean": "RMSD mean",
205
+ "percent_id": "Seq %id",
206
+ "%var_1st": "%var mode0",
207
+ "cov": "Coverage",
208
+ "global_quality": "Cluster quality",
209
+ "nb_members": "Cluster size",
210
+ }
211
+ _rc = [c for c in _rich_cols if c in df_stats.columns and df_stats[c].notna().sum() > 50]
212
+ _rl = [_rich_cols[c] for c in _rc]
213
+ _cm = df_stats[_rc].dropna().corr().round(2)
214
 
215
  fig_h = go.Figure(data=go.Heatmap(
216
+ z=_cm.values, x=_rl, y=_rl,
 
217
  colorscale=[[0, "#1e1b4b"], [0.5, "#4338ca"], [1, "#ec4899"]],
218
+ text=_cm.values.round(2),
219
  texttemplate="%{text}",
220
+ textfont=dict(size=11, color="white"),
221
  hovertemplate="<b>%{x}</b> vs <b>%{y}</b><br>r = %{z:.3f}<extra></extra>",
222
  zmin=-1, zmax=1,
223
  colorbar=dict(title="r", tickfont=dict(color="#a5b4fc")),
224
  ))
225
+ fig_h.update_layout(**PLOT_LAYOUT, height=420,
226
+ title=f"Feature Correlation β€” {len(df_stats):,} proteins (structural stats + PETIMOT metrics)")
227
  st.plotly_chart(fig_h, use_container_width=True, key="heatmap")
228
 
229
+ # ── Success vs Failure analysis ──
230
+ st.markdown("#### 🎯 What separates PETIMOT successes from failures?")
231
+ st.caption("Success = RMSIPΒ² > 0.5 (PETIMOT outperforms NMA on directionality)")
232
+
233
+ _df_sv = df_stats.dropna(subset=["rmsip_sq", "ref_len"])
234
+ _df_sv = _df_sv.copy()
235
+ _df_sv["outcome"] = _df_sv["rmsip_sq"].apply(lambda x: "βœ… Success" if x > 0.5 else "❌ Failure")
236
+
237
+ _violin_features = [
238
+ ("ref_len", "Sequence Length (residues)"),
239
+ ("%var_1st", "% Variance in NMA mode 0"),
240
+ ("rmsd_mean", "Intra-cluster RMSD (Γ…)"),
241
+ ("percent_id", "Sequence identity"),
242
+ ]
243
+
244
+ _vcols = st.columns(2)
245
+ for i, (feat, label) in enumerate(_violin_features):
246
+ if feat not in _df_sv.columns: continue
247
+ fig_v = go.Figure()
248
+ for outcome, color in [("βœ… Success", "#10b981"), ("❌ Failure", "#ef4444")]:
249
+ vals = _df_sv[_df_sv["outcome"] == outcome][feat].dropna()
250
+ fig_v.add_trace(go.Violin(
251
+ y=vals, name=outcome,
252
+ line_color=color,
253
+ fillcolor={"#10b981": "rgba(16,185,129,0.2)", "#ef4444": "rgba(239,68,68,0.2)"}.get(color, "rgba(99,102,241,0.2)"),
254
+ box_visible=True, meanline_visible=True, showlegend=(i == 0),
255
+ ))
256
+ fig_v.update_layout(**PLOT_LAYOUT, height=280, title=label,
257
+ yaxis_title=label, margin=dict(l=40, r=10, t=45, b=30))
258
+ with _vcols[i % 2]:
259
+ st.plotly_chart(fig_v, use_container_width=True, key=f"violin_sv_{feat}")
260
+
261
+ # ── Top predictors (sorted abs correlation with rmsip_sq) ──
262
+ _target = "rmsip_sq"
263
+ _predictor_cols = [c for c in _rc if c != _target]
264
+ _predictor_corrs = {_rich_cols[c]: abs(_cm.loc[_target, c]) for c in _predictor_cols if _target in _cm.index}
265
+ _predictor_corrs = dict(sorted(_predictor_corrs.items(), key=lambda x: -x[1]))
266
+
267
+ fig_imp = go.Figure(go.Bar(
268
+ x=list(_predictor_corrs.values()),
269
+ y=list(_predictor_corrs.keys()),
270
+ orientation="h",
271
+ marker_color=["#6366f1" if v > 0.15 else "#4338ca" for v in _predictor_corrs.values()],
272
+ text=[f"{v:.3f}" for v in _predictor_corrs.values()],
273
+ textposition="outside",
274
+ ))
275
+ fig_imp.update_layout(**PLOT_LAYOUT, height=280,
276
+ title="Absolute correlation with RMSIPΒ² (feature importance proxy)",
277
+ xaxis=dict(title="|r|", range=[0, max(_predictor_corrs.values()) * 1.3]),
278
+ margin=dict(l=140, r=40, t=50, b=30))
279
+ st.plotly_chart(fig_imp, use_container_width=True, key="feat_imp")
280
+
281
+ else:
282
+ # Fallback: basic correlations from predictions only
283
+ _all_corr = {"seq_len": "Seq Length", "mean_disp_m0": "Mean Ξ”",
284
+ "max_disp_m0": "Max Ξ”"}
285
+ _rc = [c for c in _all_corr if c in df.columns and df[c].notna().sum() > 5]
286
+ _rl = [_all_corr[c] for c in _rc]
287
+ _cm = df[_rc].dropna().corr().fillna(0)
288
+ fig_h = go.Figure(data=go.Heatmap(
289
+ z=_cm.values, x=_rl, y=_rl,
290
+ colorscale=[[0, "#1e1b4b"], [0.5, "#4338ca"], [1, "#ec4899"]],
291
+ text=np.round(_cm.values, 2), texttemplate="%{text}",
292
+ zmin=-1, zmax=1,
293
+ ))
294
+ fig_h.update_layout(**PLOT_LAYOUT, height=300, title="Feature Correlation (predictions only)")
295
+ st.plotly_chart(fig_h, use_container_width=True, key="heatmap")
296
 
297
+ col_scatter_dummy = st.container()
298
+ with col_scatter_dummy:
299
+ fig_s = go.Figure()
300
  fig_s.add_trace(go.Histogram2dContour(
301
  x=df.seq_len, y=df.mean_disp_m0,
302
  colorscale=[[0, "rgba(30,27,75,0)"], [0.3, "rgba(99,102,241,0.3)"],
303
  [0.6, "rgba(139,92,246,0.5)"], [1, "rgba(236,72,153,0.7)"]],
304
+ ncontours=15, showscale=False, hoverinfo="skip",
 
 
305
  ))
 
306
  fig_s.add_trace(go.Scattergl(
307
+ x=df.seq_len, y=df.mean_disp_m0, mode="markers",
308
+ marker=dict(size=3, color=df.max_disp_m0, colorscale="Viridis", showscale=True,
309
+ colorbar=dict(title="Max Ξ” (Γ…)", tickfont=dict(color="#a5b4fc")), opacity=0.6),
 
 
 
 
 
310
  text=df.name,
311
  hovertemplate="<b>%{text}</b><br>Length: %{x}<br>Mean Ξ”: %{y:.3f} Γ…<extra></extra>",
312
  ))
313
+ fig_s.update_layout(**PLOT_LAYOUT, height=350, showlegend=False,
314
+ title="Sequence Length vs Mean Displacement (36k proteins)",
315
+ xaxis_title="Sequence Length", yaxis_title="Mean Displacement (Γ…)")
 
 
 
316
  st.plotly_chart(fig_s, use_container_width=True, key="scatter")
317
 
318
 
319
+
320
  # ═══════════════════════════════════════
321
  # SECTION 3: Top-10 Leaderboards
322
  # ═══════════════════════════════════════
app/pages/5_🔬_Protein_Detail.py CHANGED
@@ -7,6 +7,7 @@ import plotly.graph_objects as go
7
  import plotly.express as px
8
  from plotly.subplots import make_subplots
9
  from pathlib import Path
 
10
 
11
  from app.utils.data_loader import find_predictions_dir, load_modes, load_ground_truth, PETIMOT_ROOT
12
 
@@ -121,15 +122,27 @@ with st.expander(f"πŸ“¦ Cluster info (n={int(row['nb_members'])} members)", expa
121
  margin=dict(l=40,r=20,t=40,b=30))
122
  st.plotly_chart(fig_q, use_container_width=True)
123
 
124
- # ── Mode displacement ─────────────────────────────────────────────
125
- st.divider()
126
- st.markdown("### 🌊 Predicted Normal Modes (PETIMOT)")
127
  pred_dir = find_predictions_dir(PETIMOT_ROOT)
128
  modes = load_modes(pred_dir, selected_key) if pred_dir else {}
129
 
130
  gt_dir = os.path.join(PETIMOT_ROOT, "ground_truth")
131
  gt = load_ground_truth(gt_dir, selected_key)
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  if modes:
134
  n_modes = len(modes)
135
  mode_tabs = st.tabs([f"Mode {k}" for k in sorted(modes.keys())])
 
7
  import plotly.express as px
8
  from plotly.subplots import make_subplots
9
  from pathlib import Path
10
+ from app.utils.bio_api import get_sequence_from_pdb, render_sequence_aa, get_protein_mutations
11
 
12
  from app.utils.data_loader import find_predictions_dir, load_modes, load_ground_truth, PETIMOT_ROOT
13
 
 
122
  margin=dict(l=40,r=20,t=40,b=30))
123
  st.plotly_chart(fig_q, use_container_width=True)
124
 
125
+ # ── Amino acid sequence strip ─────────────────────────────────────────
 
 
126
  pred_dir = find_predictions_dir(PETIMOT_ROOT)
127
  modes = load_modes(pred_dir, selected_key) if pred_dir else {}
128
 
129
  gt_dir = os.path.join(PETIMOT_ROOT, "ground_truth")
130
  gt = load_ground_truth(gt_dir, selected_key)
131
 
132
+ with st.spinner("Fetching sequence from RCSB..."):
133
+ seq = get_sequence_from_pdb(selected_key)
134
+ _mode0_vecs = list(modes.values())[0] if modes else None
135
+ _mode0_mags = (np.linalg.norm(_mode0_vecs, axis=1) if _mode0_vecs is not None and _mode0_vecs.ndim > 1
136
+ else (np.abs(_mode0_vecs) if _mode0_vecs is not None else None))
137
+ _muts = get_protein_mutations(selected_key, int(row["ref_len"]))
138
+ render_sequence_aa(seq or "", displacements=_mode0_mags, mutations=_muts,
139
+ title=f"πŸ”€ {selected_key} β€” AA sequence (opacity = mode 0 displacement)")
140
+
141
+ # ── Mode displacement ─────────────────────────────────────────────
142
+ st.divider()
143
+ st.markdown("### 🌊 Predicted Normal Modes (PETIMOT)")
144
+
145
+
146
  if modes:
147
  n_modes = len(modes)
148
  mode_tabs = st.tabs([f"Mode {k}" for k in sorted(modes.keys())])
app/utils/bio_api.py CHANGED
@@ -1,4 +1,4 @@
1
- """External biology APIs (EBI, UniProt) for mutation and sequence tracking."""
2
  import requests
3
  import numpy as np
4
  import streamlit as st
@@ -6,60 +6,184 @@ import logging
6
 
7
  logger = logging.getLogger(__name__)
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  @st.cache_data(ttl=86400, show_spinner=False)
10
  def get_uniprot_id_from_pdb(pdb_id: str) -> str | None:
11
  """Map a 4-letter PDB ID to its primary UniProt accession using PDBe API."""
12
  pdb_id = pdb_id[:4].lower()
13
- url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
14
  try:
15
- r = requests.get(url, timeout=10)
16
- if not r.ok:
17
- return None
18
- data = r.json()
19
- if pdb_id in data and "UniProt" in data[pdb_id]:
20
- # Just take the first UniProt accession mapped
21
- return list(data[pdb_id]["UniProt"].keys())[0]
22
  except Exception as e:
23
  logger.warning(f"PDBe Mapping failed for {pdb_id}: {e}")
24
  return None
25
 
 
26
  @st.cache_data(ttl=86400, show_spinner="Fetching evolutionary mutations...")
27
- def fetch_mutation_frequency(uniprot_id: str, seq_length: int) -> np.ndarray | None:
28
- """Fetch known natural variants from EBI and return frequency per residue."""
29
- url = f"https://www.ebi.ac.uk/proteins/api/variation/{uniprot_id}"
30
  try:
31
- r = requests.get(url, headers={"Accept": "application/json"}, timeout=15)
 
 
32
  if not r.ok:
33
  return None
34
-
35
- data = r.json()
36
- features = data.get("features", [])
37
-
38
- # Array to store mutation counts per position
39
  freqs = np.zeros(seq_length)
40
-
41
- for f in features:
42
  if f.get("type") == "VARIANT":
43
  try:
44
- begin = int(f.get("begin", -1))
45
- # 1-indexed to 0-indexed
46
- if 1 <= begin <= seq_length:
47
- freqs[begin - 1] += 1
48
  except ValueError:
49
  continue
50
-
51
  return freqs
52
  except Exception as e:
53
  logger.warning(f"Variation API failed for {uniprot_id}: {e}")
54
  return None
55
 
 
56
  @st.cache_data(ttl=86400, show_spinner=False)
57
- def get_protein_mutations(protein_name: str, seq_length: int) -> np.ndarray | None:
58
- """End-to-end: PDB Name -> UniProt -> Mutation Frequencies."""
59
- # Assuming protein_name format corresponds to a PDB ID in its first 4 chars
60
  if len(protein_name) >= 4:
61
- pdb_id = protein_name[:4]
62
- uniprot_id = get_uniprot_id_from_pdb(pdb_id)
63
- if uniprot_id:
64
- return fetch_mutation_frequency(uniprot_id, seq_length)
65
  return None
 
1
+ """External biology APIs (EBI, UniProt, RCSB) for mutation and sequence tracking."""
2
  import requests
3
  import numpy as np
4
  import streamlit as st
 
6
 
7
  logger = logging.getLogger(__name__)
8
 
9
+ # ── Amino acid property colours ──────────────────────────────────────
10
# Tile colour per one-letter residue code, grouped by physicochemical class.
# Shades within a class differ slightly so neighbouring residues stay
# distinguishable in the rendered strip.
AA_COLORS = {
    # Hydrophobic
    "A": "#7c3aed",
    "V": "#7c3aed",
    "I": "#6d28d9",
    "L": "#6d28d9",
    "M": "#7c3aed",
    "F": "#5b21b6",
    "W": "#4c1d95",
    "P": "#8b5cf6",
    # Positively charged
    "K": "#0891b2",
    "R": "#0e7490",
    "H": "#06b6d4",
    # Negatively charged
    "D": "#e11d48",
    "E": "#be123c",
    # Polar
    "S": "#0d9488",
    "T": "#0f766e",
    "N": "#115e59",
    "Q": "#134e4a",
    # Special
    "C": "#d97706",
    "G": "#b45309",
    "Y": "#92400e",
    # Unknown residue
    "X": "#475569",
}

# Three-letter label per one-letter residue code (used for tile tooltips).
AA_LABELS = {
    "A": "Ala",
    "V": "Val",
    "I": "Ile",
    "L": "Leu",
    "M": "Met",
    "F": "Phe",
    "W": "Trp",
    "P": "Pro",
    "K": "Lys",
    "R": "Arg",
    "H": "His",
    "D": "Asp",
    "E": "Glu",
    "S": "Ser",
    "T": "Thr",
    "N": "Asn",
    "Q": "Gln",
    "C": "Cys",
    "G": "Gly",
    "Y": "Tyr",
    "X": "Unk",
}
32
+
33
+
34
+ # ── Sequence fetching ─────────────────────────────────────────────────
35
def _fasta_header_chain_ids(header: str) -> set[str]:
    """Return every chain identifier named in an RCSB FASTA header line.

    The chain list is taken from the second ``|``-separated field, e.g.
    ``>1HO5_1|Chains A, B|...`` or ``>XXXX_1|Chains A[auth H]|...``.
    Both the leading identifier and the bracketed ``auth`` identifier (when
    present) are returned.
    NOTE(review): header layout follows the RCSB FASTA download format;
    confirm against live responses.
    """
    fields = header.split("|")
    if len(fields) < 2:
        return set()
    chains_field = fields[1].strip()
    # "Chains" must be tried first so that "Chain" does not leave a stray "s".
    for prefix in ("Chains", "Chain"):
        if chains_field.startswith(prefix):
            chains_field = chains_field[len(prefix):]
            break
    ids: set[str] = set()
    for token in chains_field.split(","):
        label, _, auth = token.strip().partition("[auth ")
        label = label.strip()
        auth = auth.rstrip("]").strip()
        if label:
            ids.add(label)
        if auth:
            ids.add(auth)
    return ids


def _sequence_from_fasta(fasta_text: str, chain: str) -> str | None:
    """Return the sequence of the first FASTA record that lists *chain*."""
    parts: list[str] = []
    capturing = False
    for raw_line in fasta_text.splitlines():
        line = raw_line.strip()
        if line.startswith(">"):
            if parts:
                # The first matching record is complete; do not append the
                # sequences of later records that also list this chain.
                break
            capturing = chain in _fasta_header_chain_ids(line)
        elif capturing and line:
            parts.append(line)
    return "".join(parts) or None


@st.cache_data(ttl=86400, show_spinner=False)
def get_sequence_from_pdb(protein_name: str) -> str | None:
    """Fetch the amino-acid sequence from RCSB for a name like '1HO5A'.

    The first four characters are used as the PDB ID and the fifth as the
    chain (defaulting to "A" when the name has only four characters); both
    are upper-cased.

    Two sources are tried in order:
      1. RCSB REST: polymer_entity_instance -> entity_id -> polymer_entity,
         reading ``entity_poly.pdbx_seq_one_letter_code_can``.
      2. The RCSB FASTA download for the entry, selecting the record whose
         header lists the requested chain.

    Returns:
        The one-letter sequence, or None when the name is shorter than four
        characters or neither source yields a sequence. Network and parsing
        errors are logged as warnings and never raised (best-effort lookup).
    """
    if len(protein_name) < 4:
        return None
    pdb_id = protein_name[:4].upper()
    chain = protein_name[4].upper() if len(protein_name) >= 5 else "A"

    # Try RCSB REST: entity instance -> entity -> sequence
    try:
        r = requests.get(
            f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdb_id}/{chain}",
            timeout=10)
        if r.ok:
            entity_id = (r.json()
                         .get("rcsb_polymer_entity_instance_container_identifiers", {})
                         .get("entity_id"))
            if entity_id:
                r2 = requests.get(
                    f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}",
                    timeout=10)
                if r2.ok:
                    # "or ''" guards against an explicit null in the payload.
                    seq = (r2.json()
                           .get("entity_poly", {})
                           .get("pdbx_seq_one_letter_code_can") or "")
                    seq = seq.replace("\n", "").strip()
                    if seq:
                        return seq
    except Exception as e:  # best-effort boundary: log and fall through
        logger.warning(f"RCSB entity fetch failed for {protein_name}: {e}")

    # Fallback: FASTA endpoint, pick the record that lists the right chain
    try:
        r3 = requests.get(f"https://www.rcsb.org/fasta/entry/{pdb_id}/download", timeout=10)
        if r3.ok:
            seq = _sequence_from_fasta(r3.text, chain)
            if seq:
                return seq
    except Exception as e:  # best-effort boundary: log and give up
        logger.warning(f"RCSB FASTA fetch failed for {protein_name}: {e}")
    return None
81
+
82
+
83
+ # ── Coloured AA sequence renderer ────────────────────────────────────
84
def render_sequence_aa(
    sequence: str,
    displacements: "np.ndarray | None" = None,
    mutations: "np.ndarray | None" = None,
    title: str = "Amino Acid Sequence",
) -> None:
    """Render a coloured amino-acid strip in Streamlit.

    - Tile colour  = AA physicochemical property (``AA_COLORS``)
    - Opacity      = displacement magnitude relative to the largest finite
                     value (0.35 .. 1.0); 0.85 where no displacement is given
    - Red border   = position with a mutation count > 0

    Args:
        sequence: One-letter residue string. Empty shows an info message and
            nothing else.
        displacements: Optional per-residue magnitudes, indexed like
            ``sequence``; positions past its end are drawn at fixed opacity.
        mutations: Optional per-residue variant counts, indexed like
            ``sequence``.
        title: Heading shown above the strip.
    """
    # Local import keeps this block self-contained; the sequence comes from a
    # remote API and is injected into raw HTML, so it must be escaped.
    import html

    if not sequence:
        st.info("Sequence not available — fetching from RCSB failed.")
        return

    n = len(sequence)
    n_disp = len(displacements) if displacements is not None else 0
    n_mut = len(mutations) if mutations is not None else 0

    # Scale against the largest *finite* displacement so that a single NaN
    # does not turn every alpha into "nan" (invalid CSS).
    max_disp = 1.0
    if n_disp:
        disp_arr = np.asarray(displacements, dtype=float)
        finite = disp_arr[np.isfinite(disp_arr)]
        max_disp = float(finite.max()) if finite.size else 0.0

    st.markdown(f"**{title}** — {n} residues")
    st.markdown("""
<style>
.seq-strip{display:flex;flex-wrap:wrap;gap:2px;margin-bottom:8px;}
.aa-tile{width:22px;height:22px;border-radius:4px;display:flex;align-items:center;
justify-content:center;font-size:10px;font-weight:700;color:white;cursor:default;
border:2px solid transparent;transition:transform .1s;}
.aa-tile:hover{transform:scale(1.35);z-index:10;}
.aa-mut{border:2px solid #f43f5e!important;}
</style>""", unsafe_allow_html=True)

    tiles = []
    for i, aa in enumerate(sequence):
        code = aa.upper()  # colour/label tables are keyed by upper-case codes
        h = AA_COLORS.get(code, "#475569").lstrip("#")
        r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)

        tip = f"{AA_LABELS.get(code, aa)}{i+1}"
        if i < n_disp:
            d = float(displacements[i])
            frac = d / (max_disp + 1e-8) if np.isfinite(d) else 0.0
            alpha = 0.35 + 0.65 * min(max(frac, 0.0), 1.0)
            tip += f" Δ={d:.2f}Å"
        else:
            alpha = 0.85

        is_mutated = i < n_mut and mutations[i] > 0
        if is_mutated:
            tip += f" [{int(mutations[i])} variant(s)]"
        mut_cls = " aa-mut" if is_mutated else ""

        bg = f"rgba({r},{g},{b},{alpha:.2f})"
        tiles.append(
            f'<div class="aa-tile{mut_cls}" style="background:{bg}" '
            f'title="{html.escape(tip, quote=True)}">{html.escape(aa)}</div>'
        )

    st.markdown(f'<div class="seq-strip">{"".join(tiles)}</div>', unsafe_allow_html=True)
    st.markdown("""
<div style="display:flex;gap:12px;flex-wrap:wrap;font-size:11px;color:#94a3b8;margin-top:2px;">
<span><span style="background:#7c3aed;padding:1px 5px;border-radius:3px;color:white">■</span> Hydrophobic</span>
<span><span style="background:#0891b2;padding:1px 5px;border-radius:3px;color:white">■</span> (+) charged</span>
<span><span style="background:#e11d48;padding:1px 5px;border-radius:3px;color:white">■</span> (−) charged</span>
<span><span style="background:#0d9488;padding:1px 5px;border-radius:3px;color:white">■</span> Polar</span>
<span><span style="background:#d97706;padding:1px 5px;border-radius:3px;color:white">■</span> Special</span>
<span style="color:#f43f5e;">🔴 border = mutation site · opacity = predicted Δ</span>
</div>""", unsafe_allow_html=True)
140
+
141
+
142
+ # ── UniProt / EBI mutation fetching ──────────────────────────────────
143
  @st.cache_data(ttl=86400, show_spinner=False)
144
  def get_uniprot_id_from_pdb(pdb_id: str) -> str | None:
145
  """Map a 4-letter PDB ID to its primary UniProt accession using PDBe API."""
146
  pdb_id = pdb_id[:4].lower()
 
147
  try:
148
+ r = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}", timeout=10)
149
+ if r.ok:
150
+ data = r.json()
151
+ if pdb_id in data and "UniProt" in data[pdb_id]:
152
+ return list(data[pdb_id]["UniProt"].keys())[0]
 
 
153
  except Exception as e:
154
  logger.warning(f"PDBe Mapping failed for {pdb_id}: {e}")
155
  return None
156
 
157
+
158
  @st.cache_data(ttl=86400, show_spinner="Fetching evolutionary mutations...")
159
+ def fetch_mutation_frequency(uniprot_id: str, seq_length: int) -> "np.ndarray | None":
160
+ """Fetch known natural variants from EBI and return count per residue."""
 
161
  try:
162
+ r = requests.get(
163
+ f"https://www.ebi.ac.uk/proteins/api/variation/{uniprot_id}",
164
+ headers={"Accept": "application/json"}, timeout=15)
165
  if not r.ok:
166
  return None
 
 
 
 
 
167
  freqs = np.zeros(seq_length)
168
+ for f in r.json().get("features", []):
 
169
  if f.get("type") == "VARIANT":
170
  try:
171
+ pos = int(f.get("begin", -1))
172
+ if 1 <= pos <= seq_length:
173
+ freqs[pos - 1] += 1
 
174
  except ValueError:
175
  continue
 
176
  return freqs
177
  except Exception as e:
178
  logger.warning(f"Variation API failed for {uniprot_id}: {e}")
179
  return None
180
 
181
+
182
  @st.cache_data(ttl=86400, show_spinner=False)
183
+ def get_protein_mutations(protein_name: str, seq_length: int) -> "np.ndarray | None":
184
+ """End-to-end: PDB Name β†’ UniProt β†’ Mutation Frequencies."""
 
185
  if len(protein_name) >= 4:
186
+ uid = get_uniprot_id_from_pdb(protein_name[:4])
187
+ if uid:
188
+ return fetch_mutation_frequency(uid, seq_length)
 
189
  return None