"""External biology APIs (EBI, UniProt, RCSB) for mutation and sequence tracking."""
import requests
import numpy as np
import streamlit as st
import logging
logger = logging.getLogger(__name__)
# ── Amino acid property colours ──────────────────────────────────────
# Hex display colour for each one-letter amino-acid code. Residues are grouped
# by the property class named above each group; shades vary within a class.
AA_COLORS = {
    # Hydrophobic
    "A": "#7c3aed",
    "V": "#7c3aed",
    "I": "#6d28d9",
    "L": "#6d28d9",
    "M": "#7c3aed",
    "F": "#5b21b6",
    "W": "#4c1d95",
    "P": "#8b5cf6",
    # Charged positive
    "K": "#0891b2",
    "R": "#0e7490",
    "H": "#06b6d4",
    # Charged negative
    "D": "#e11d48",
    "E": "#be123c",
    # Polar
    "S": "#0d9488",
    "T": "#0f766e",
    "N": "#115e59",
    "Q": "#134e4a",
    # Special
    "C": "#d97706",
    "G": "#b45309",
    "Y": "#92400e",
    # Unknown
    "X": "#475569",
}

# Three-letter label for each one-letter code, in the same key order as
# AA_COLORS ("X" maps to "Unk").
AA_LABELS = dict(zip(
    "AVILMFWPKRHDESTNQCGYX",
    (
        "Ala", "Val", "Ile", "Leu", "Met", "Phe", "Trp", "Pro",
        "Lys", "Arg", "His",
        "Asp", "Glu",
        "Ser", "Thr", "Asn", "Gln",
        "Cys", "Gly", "Tyr",
        "Unk",
    ),
))
# ── Sequence fetching ─────────────────────────────────────────────────
def _fasta_header_chain_ids(header: str) -> set[str]:
    """Return every chain ID named in the chain field of a FASTA header.

    Expects the pipe-delimited layout ``>ID|Chain A|...`` or
    ``>ID|Chains A, B[auth H]|...``. For a token like ``B[auth H]`` both
    ``B`` and ``H`` are returned. Headers in any other layout yield an
    empty set.
    """
    fields = header.split("|")
    if len(fields) < 2:
        return set()
    spec = fields[1].strip()
    if spec.startswith("Chains "):
        spec = spec.removeprefix("Chains ")
    elif spec.startswith("Chain "):
        spec = spec.removeprefix("Chain ")
    else:
        return set()

    ids: set[str] = set()
    for token in spec.split(","):
        label, _, auth = token.strip().partition("[auth ")
        ids.add(label.strip())
        ids.add(auth.rstrip("]").strip())
    ids.discard("")
    return ids


def _sequence_from_fasta(fasta_text: str, chain: str) -> str:
    """Return the sequence of the first FASTA record listing *chain*, or ''."""
    pieces: list[str] = []
    capturing = False
    for raw in fasta_text.splitlines():
        line = raw.strip()
        if line.startswith(">"):
            if pieces:
                # The first matching record is complete; never merge a
                # second record's residues into it.
                break
            capturing = chain in _fasta_header_chain_ids(line)
        elif capturing and line:
            pieces.append(line)
    return "".join(pieces)


@st.cache_data(ttl=86400, show_spinner=False)
def get_sequence_from_pdb(protein_name: str) -> str | None:
    """Fetch the amino-acid sequence for a name like '1HO5A' from RCSB.

    The first four characters are the PDB ID and the optional fifth is the
    chain ID (default "A"); both are upper-cased before use.

    Two sources are tried in order:
      1. RCSB Data API: polymer entity instance -> entity ->
         ``pdbx_seq_one_letter_code_can``.
      2. RCSB FASTA download, taking the first record whose header lists
         the requested chain.

    Args:
        protein_name: PDB ID optionally followed by a one-letter chain ID.

    Returns:
        The one-letter sequence, or None when the name is shorter than four
        characters or neither source yields a sequence. Network and parsing
        errors are logged as warnings and never raised.
    """
    if len(protein_name) < 4:
        return None
    pdb_id = protein_name[:4].upper()
    chain = protein_name[4].upper() if len(protein_name) >= 5 else "A"

    # Primary route: RCSB REST, entity instance -> entity -> sequence.
    try:
        instance = requests.get(
            f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdb_id}/{chain}",
            timeout=10)
        if instance.ok:
            entity_id = (instance.json()
                         .get("rcsb_polymer_entity_instance_container_identifiers", {})
                         .get("entity_id"))
            if entity_id:
                entity = requests.get(
                    f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}",
                    timeout=10)
                if entity.ok:
                    # `or ""` guards against an explicit null in the payload.
                    seq = (entity.json()
                           .get("entity_poly", {})
                           .get("pdbx_seq_one_letter_code_can") or "")
                    seq = seq.replace("\n", "").strip()
                    if seq:
                        return seq
    except Exception as e:  # best-effort: fall through to the FASTA route
        logger.warning("RCSB entity fetch failed for %s: %s", protein_name, e)

    # Fallback: FASTA endpoint, pick the record for the right chain.
    try:
        fasta = requests.get(
            f"https://www.rcsb.org/fasta/entry/{pdb_id}/download", timeout=10)
        if fasta.ok:
            seq = _sequence_from_fasta(fasta.text, chain)
            if seq:
                return seq
    except Exception as e:
        logger.warning("RCSB FASTA fetch failed for %s: %s", protein_name, e)
    return None
# ── Coloured AA sequence renderer ────────────────────────────────────
def render_sequence_aa(
    sequence: str,
    displacements: "np.ndarray | None" = None,
    mutations: "np.ndarray | None" = None,
    title: str = "Amino Acid Sequence",
) -> None:
    """Render a coloured amino-acid strip in Streamlit.

    - Tile colour  = AA physicochemical property (from ``AA_COLORS``)
    - Opacity      = displacement relative to the largest one (if provided)
    - Red border   = residue with a positive mutation count (if provided)

    Residue letters and tooltips are HTML-escaped before being placed in the
    markup, because the sequence may originate from an external service and
    the strip is rendered with ``unsafe_allow_html=True``.

    Args:
        sequence: One-letter residue string; an empty value shows an info box.
        displacements: Optional per-residue values indexed like ``sequence``;
            residues beyond its length get a fixed 0.85 opacity.
        mutations: Optional per-residue counts indexed like ``sequence``.
        title: Heading shown above the strip.
    """
    import html  # function scope: only needed for escaping the tile markup

    if not sequence:
        st.info("Sequence not available — fetching from RCSB failed.")
        return

    n = len(sequence)
    has_disp = displacements is not None and len(displacements) > 0
    max_disp = float(np.max(displacements)) if has_disp else 1.0

    st.markdown(f"**{title}** — {n} residues")
    st.markdown(
        "\n<style>\n"
        ".seq-strip{display:flex;flex-wrap:wrap;gap:2px;margin-bottom:8px;}\n"
        ".aa-tile{width:22px;height:22px;border-radius:4px;display:flex;align-items:center;\n"
        "justify-content:center;font-size:10px;font-weight:700;color:white;cursor:default;\n"
        "border:2px solid transparent;transition:transform .1s;}\n"
        ".aa-tile:hover{transform:scale(1.35);z-index:10;}\n"
        ".aa-mut{border:2px solid #f43f5e!important;}\n"
        "</style>",
        unsafe_allow_html=True)

    # Parse each distinct residue colour once instead of once per tile.
    rgb_by_aa: dict[str, tuple[int, int, int]] = {}
    tiles = []
    for i, aa in enumerate(sequence):
        if aa not in rgb_by_aa:
            hex_code = AA_COLORS.get(aa, "#475569").lstrip("#")
            rgb_by_aa[aa] = (int(hex_code[0:2], 16),
                             int(hex_code[2:4], 16),
                             int(hex_code[4:6], 16))
        r, g, b = rgb_by_aa[aa]

        disp_here = displacements is not None and i < len(displacements)
        if disp_here:
            alpha = 0.35 + 0.65 * float(displacements[i]) / (max_disp + 1e-8)
            # Keep the CSS alpha valid even for negative or NaN inputs.
            alpha = min(1.0, max(0.0, alpha))
        else:
            alpha = 0.85
        bg = f"rgba({r},{g},{b},{alpha:.2f})"

        is_mut = mutations is not None and i < len(mutations) and mutations[i] > 0
        mut_cls = " aa-mut" if is_mut else ""

        tip = f"{AA_LABELS.get(aa, aa)}{i + 1}"
        if disp_here:
            tip += f" Δ={displacements[i]:.2f}Å"
        if is_mut:
            tip += f" [{int(mutations[i])} variant(s)]"

        tiles.append(
            f'<div class="aa-tile{mut_cls}" style="background:{bg}" '
            f'title="{html.escape(tip, quote=True)}">{html.escape(aa)}</div>')

    st.markdown(f'<div class="seq-strip">{"".join(tiles)}</div>', unsafe_allow_html=True)

    # Legend for the colour classes, border and opacity encodings.
    st.markdown(
        "\n"
        '<div style="display:flex;gap:12px;flex-wrap:wrap;font-size:11px;color:#94a3b8;margin-top:2px;">\n'
        '<span><span style="background:#7c3aed;padding:1px 5px;border-radius:3px;color:white">■</span> Hydrophobic</span>\n'
        '<span><span style="background:#0891b2;padding:1px 5px;border-radius:3px;color:white">■</span> (+) charged</span>\n'
        '<span><span style="background:#e11d48;padding:1px 5px;border-radius:3px;color:white">■</span> (−) charged</span>\n'
        '<span><span style="background:#0d9488;padding:1px 5px;border-radius:3px;color:white">■</span> Polar</span>\n'
        '<span><span style="background:#d97706;padding:1px 5px;border-radius:3px;color:white">■</span> Special</span>\n'
        '<span style="color:#f43f5e;">🔴 border = mutation site · opacity = predicted Δ</span>\n'
        "</div>",
        unsafe_allow_html=True)
# ── UniProt / EBI mutation fetching ──────────────────────────────────
@st.cache_data(ttl=86400, show_spinner=False)
def get_uniprot_id_from_pdb(pdb_id: str) -> str | None:
    """Map a 4-letter PDB ID to a UniProt accession using the PDBe API.

    Only the first four characters of *pdb_id* are used, lower-cased. When
    the entry maps to several accessions, the first one listed in the
    response is returned.

    Args:
        pdb_id: PDB identifier; extra trailing characters are ignored.

    Returns:
        A UniProt accession, or None when the ID is shorter than four
        characters, the request fails, or the entry has no UniProt mapping.
        Errors are logged as warnings and never raised.
    """
    pdb_id = pdb_id[:4].lower()
    if len(pdb_id) < 4:
        # Too short to be a PDB ID; don't hit the API with a partial path.
        return None
    try:
        r = requests.get(
            f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}", timeout=10)
        if r.ok:
            mappings = r.json().get(pdb_id, {}).get("UniProt") or {}
            # An empty mapping is a normal "no result", not an error.
            return next(iter(mappings), None)
    except Exception as e:
        logger.warning("PDBe Mapping failed for %s: %s", pdb_id, e)
    return None
@st.cache_data(ttl=86400, show_spinner="Fetching evolutionary mutations...")
def fetch_mutation_frequency(uniprot_id: str, seq_length: int) -> "np.ndarray | None":
    """Fetch known variants from the EBI Proteins API and count them per residue.

    Each feature of type ``"VARIANT"`` adds one to the residue at its
    ``begin`` position (1-based). Features whose ``begin`` is missing,
    null, non-numeric or outside ``1..seq_length`` are skipped.

    Args:
        uniprot_id: UniProt accession to query.
        seq_length: Length of the returned array.

    Returns:
        A float array of length ``seq_length`` with per-residue counts, or
        None when the request fails. Errors are logged as warnings.
    """
    try:
        r = requests.get(
            f"https://www.ebi.ac.uk/proteins/api/variation/{uniprot_id}",
            headers={"Accept": "application/json"}, timeout=15)
        if not r.ok:
            return None
        freqs = np.zeros(seq_length)
        for feature in r.json().get("features", []):
            if feature.get("type") != "VARIANT":
                continue
            try:
                pos = int(feature.get("begin", -1))
            except (TypeError, ValueError):
                # A null `begin` raises TypeError and a non-numeric one
                # ValueError; skip that feature, keep the rest.
                continue
            if 1 <= pos <= seq_length:
                freqs[pos - 1] += 1
        return freqs
    except Exception as e:
        logger.warning("Variation API failed for %s: %s", uniprot_id, e)
        return None
@st.cache_data(ttl=86400, show_spinner=False)
def get_protein_mutations(protein_name: str, seq_length: int) -> "np.ndarray | None":
    """Resolve a PDB-style name to per-residue mutation counts.

    Chains the two lookups: PDB ID (first four characters of the name)
    -> UniProt accession -> variant counts of length ``seq_length``.

    Returns:
        The result of ``fetch_mutation_frequency``, or None when the name
        is shorter than four characters or no accession is found.
    """
    if len(protein_name) < 4:
        return None
    accession = get_uniprot_id_from_pdb(protein_name[:4])
    if not accession:
        return None
    return fetch_mutation_frequency(accession, seq_length)
# ── PDB structure fetching (for 3D viewer & ATLAS pairs) ─────────────
@st.cache_data(ttl=86400, show_spinner=False)
def fetch_pdb_structure(protein_name: str) -> "str | None":
    """Download the PDB file for a name like '1A3RH' from RCSB.

    The first four characters are the PDB ID; an optional fifth character
    is the chain. With a chain, the text is reduced to that chain's ATOM
    records (chain ID read from column 22) plus every line starting with
    HEADER, TITLE, REMARK or END. Without a chain, the file is returned
    unfiltered.

    Returns:
        The PDB text, or None when the name is shorter than four
        characters, the download is not OK, or an exception occurs (which
        is logged as a warning).
    """
    if len(protein_name) < 4:
        return None

    pdb_id = protein_name[:4].upper()
    chain = protein_name[4].upper() if len(protein_name) >= 5 else None
    passthrough_prefixes = ("HEADER", "TITLE", "REMARK", "END")

    try:
        response = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb", timeout=20)
        if not response.ok:
            return None

        full_text = response.text
        if not chain:
            return full_text

        kept = []
        for record in full_text.split("\n"):
            # Index 21 is the chain-ID column of an ATOM record.
            is_chain_atom = (record.startswith("ATOM")
                             and len(record) > 21
                             and record[21] == chain)
            if is_chain_atom or record.startswith(passthrough_prefixes):
                kept.append(record)
        return "\n".join(kept)
    except Exception as e:
        logger.warning(f"PDB download failed for {pdb_id}: {e}")
        return None
@st.cache_data(ttl=86400, show_spinner=False)
def fetch_atlas_pair_info(pair_name: str) -> dict:
    """Parse an ATLAS-style pair like '1A3RH_5W23J' and fetch RCSB metadata.

    The name is split on "_"; the first two parts are the query and target
    conformations. For each, the first four characters are the PDB ID and
    the optional fifth is the chain (default "A"), both upper-cased.

    Args:
        pair_name: Two PDB-style names joined by an underscore.

    Returns:
        ``{}`` when the name has fewer than two parts. Otherwise a dict with
        keys ``"pair"``, ``"query"`` and ``"target"``. Each side holds
        ``pdb``, ``chain`` and ``name``, plus ``title`` (cut to 60
        characters), ``resolution``, ``method`` and ``year`` when the RCSB
        entry lookup succeeds. A failed lookup is logged as a warning and
        leaves that side with only the parsed fields.
    """
    parts = pair_name.split("_")
    if len(parts) < 2:
        return {}

    result = {"pair": pair_name}
    for key, name in (("query", parts[0]), ("target", parts[1])):
        result[key] = {
            "pdb": name[:4].upper(),
            "chain": name[4].upper() if len(name) >= 5 else "A",
            "name": name,
        }

    for key in ("query", "target"):
        entry = result[key]
        pdb_id = entry["pdb"]
        try:
            r = requests.get(
                f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}", timeout=8)
            if not r.ok:
                continue
            d = r.json()
            # `or` fallbacks cover keys that are present but null, which
            # would otherwise raise on slicing or indexing.
            struct = d.get("struct") or {}
            info = d.get("rcsb_entry_info") or {}
            accession = d.get("rcsb_accession_info") or {}
            entry["title"] = (struct.get("title") or "")[:60]
            entry["resolution"] = (info.get("resolution_combined") or [None])[0]
            entry["method"] = info.get("experimental_method") or ""
            entry["year"] = (accession.get("deposit_date") or "")[:4]
        except Exception as e:
            # Metadata is optional; log the failure and keep the parsed IDs.
            logger.warning("RCSB entry fetch failed for %s: %s", pdb_id, e)
    return result
|