"""External biology APIs (EBI, UniProt, RCSB) for mutation and sequence tracking."""
import requests
import numpy as np
import streamlit as st
import logging
logger = logging.getLogger(__name__)
# ── Amino acid property colours ──────────────────────────────────────
# One-letter residue code -> hex tile colour, grouped by the property class
# used in the sequence legend. Unknown residues ("X") share the fallback grey.
AA_COLORS = {
    # Hydrophobic
    "A": "#7c3aed",  # Ala
    "V": "#7c3aed",  # Val
    "I": "#6d28d9",  # Ile
    "L": "#6d28d9",  # Leu
    "M": "#7c3aed",  # Met
    "F": "#5b21b6",  # Phe
    "W": "#4c1d95",  # Trp
    "P": "#8b5cf6",  # Pro
    # Charged positive
    "K": "#0891b2",  # Lys
    "R": "#0e7490",  # Arg
    "H": "#06b6d4",  # His
    # Charged negative
    "D": "#e11d48",  # Asp
    "E": "#be123c",  # Glu
    # Polar
    "S": "#0d9488",  # Ser
    "T": "#0f766e",  # Thr
    "N": "#115e59",  # Asn
    "Q": "#134e4a",  # Gln
    # Special
    "C": "#d97706",  # Cys
    "G": "#b45309",  # Gly
    "Y": "#92400e",  # Tyr
    # Unknown
    "X": "#475569",
}

# One-letter residue code -> three-letter abbreviation (used for tooltips).
AA_LABELS = {
    "A": "Ala",
    "V": "Val",
    "I": "Ile",
    "L": "Leu",
    "M": "Met",
    "F": "Phe",
    "W": "Trp",
    "P": "Pro",
    "K": "Lys",
    "R": "Arg",
    "H": "His",
    "D": "Asp",
    "E": "Glu",
    "S": "Ser",
    "T": "Thr",
    "N": "Asn",
    "Q": "Gln",
    "C": "Cys",
    "G": "Gly",
    "Y": "Tyr",
    "X": "Unk",
}
# ── Sequence fetching ─────────────────────────────────────────────────
def _chains_in_fasta_header(header: str) -> tuple[set[str], set[str]]:
    """Parse the chain field of an RCSB FASTA header into (author_ids, label_ids).

    Handles headers such as ``>1ABC_1|Chain A|...``, ``>1ABC_1|Chains A, B|...``
    and ``>1ABC_1|Chains A[auth H], B[auth L]|...``. For a chain without an
    ``[auth X]`` alias, the same ID is reported in both sets.
    """
    fields = header.split("|")
    if len(fields) < 2:
        return set(), set()
    chain_field = fields[1].strip()
    # "Chains" must be tested before "Chain" because it shares the prefix.
    for prefix in ("Chains", "Chain"):
        if chain_field.startswith(prefix):
            chain_field = chain_field[len(prefix):]
            break
    else:
        return set(), set()

    author_ids: set[str] = set()
    label_ids: set[str] = set()
    for token in chain_field.split(","):
        token = token.strip()
        if not token:
            continue
        label, has_alias, alias = token.partition("[auth ")
        label = label.strip()
        label_ids.add(label)
        author_ids.add(alias.rstrip("]").strip() if has_alias else label)
    return author_ids, label_ids


def _sequence_for_chain(fasta_text: str, chain: str) -> str:
    """Return the sequence of the FASTA record that lists *chain*, or ""."""
    # Each record is (author_ids, label_ids, sequence_lines).
    records: list[tuple[set[str], set[str], list[str]]] = []
    for line in fasta_text.strip().split("\n"):
        if line.startswith(">"):
            author_ids, label_ids = _chains_in_fasta_header(line)
            records.append((author_ids, label_ids, []))
        elif records:
            records[-1][2].append(line.strip())

    # Prefer an author-ID match (the chain letter found in PDB files); fall
    # back to the label ID so plain "Chain A" headers keep working.
    for id_index in (0, 1):
        for record in records:
            if chain in record[id_index]:
                sequence = "".join(record[2])
                if sequence:
                    return sequence
    return ""


@st.cache_data(ttl=86400, show_spinner=False)
def get_sequence_from_pdb(protein_name: str) -> str | None:
    """Fetch the amino-acid sequence from RCSB for a name like '1HO5A'.

    The first four characters are the PDB ID; the optional fifth character is
    the chain (defaults to "A"). Both are upper-cased before use.

    Two sources are tried in order:
      1. RCSB REST: polymer entity instance -> entity ID -> canonical sequence.
      2. The entry's FASTA download, selecting the record that lists the chain.

    Returns:
        The one-letter sequence, or None if the name is shorter than four
        characters or neither source yields a sequence. Network/parse errors
        are logged and treated as "not found" (best-effort lookup).
    """
    if len(protein_name) < 4:
        return None
    pdb_id = protein_name[:4].upper()
    chain = protein_name[4].upper() if len(protein_name) >= 5 else "A"

    # Try RCSB REST: entity instance → entity → sequence
    try:
        r = requests.get(
            f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdb_id}/{chain}",
            timeout=10)
        if r.ok:
            entity_id = (r.json()
                         .get("rcsb_polymer_entity_instance_container_identifiers", {})
                         .get("entity_id"))
            if entity_id:
                r2 = requests.get(
                    f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}",
                    timeout=10)
                if r2.ok:
                    seq = (r2.json()
                           .get("entity_poly", {})
                           .get("pdbx_seq_one_letter_code_can", ""))
                    seq = seq.replace("\n", "").strip()
                    if seq:
                        return seq
    except Exception as e:  # best-effort: fall through to the FASTA endpoint
        logger.warning("RCSB entity fetch failed for %s: %s", protein_name, e)

    # Fallback: FASTA endpoint, pick the right chain. The chain field is
    # parsed rather than substring-matched so that multi-chain headers
    # ("Chains A, B") match and longer IDs ("Chain AA") do not.
    try:
        r3 = requests.get(f"https://www.rcsb.org/fasta/entry/{pdb_id}/download", timeout=10)
        if r3.ok:
            seq = _sequence_for_chain(r3.text, chain)
            if seq:
                return seq
    except Exception as e:  # best-effort: report "not found" to the caller
        logger.warning("RCSB FASTA fetch failed for %s: %s", protein_name, e)
    return None
# ── Coloured AA sequence renderer ────────────────────────────────────
def render_sequence_aa(
    sequence: str,
    displacements: "np.ndarray | None" = None,
    mutations: "np.ndarray | None" = None,
    title: str = "Amino Acid Sequence",
) -> None:
    """
    Render a coloured amino-acid strip in Streamlit.

    - Tile colour = AA physicochemical property (looked up in AA_COLORS)
    - Opacity = predicted displacement magnitude (if provided)
    - Red border = known mutation site (if provided)

    Args:
        sequence: One-letter residue string. If empty, an info message is
            shown and nothing else is rendered.
        displacements: Optional per-residue values, indexed like ``sequence``.
            Residues beyond its length fall back to the default opacity.
        mutations: Optional per-residue counts, indexed like ``sequence``.
            A value > 0 marks the residue as a mutation site.
        title: Heading shown above the strip.

    NOTE(review): in this view the string literals passed to ``st.markdown``
    contain no HTML/CSS, and ``bg``, ``mut_cls`` and ``tip`` are computed but
    not referenced by the visible tile markup — the markup appears to have
    been stripped. Confirm against the real file before relying on this.
    """
    if not sequence:
        st.info("Sequence not available — fetching from RCSB failed.")
        return
    n = len(sequence)
    # Normalisation constant for opacity; 1.0 when no displacement data is given.
    max_disp = float(np.max(displacements)) if displacements is not None and len(displacements) > 0 else 1.0
    st.markdown(f"**{title}** — {n} residues")
    st.markdown("""
""", unsafe_allow_html=True)
    tiles = []
    for i, aa in enumerate(sequence):
        # Unlisted residue codes use the same grey as "X".
        color = AA_COLORS.get(aa, "#475569")
        # Split "#rrggbb" into integer channels so an alpha can be attached.
        h = color.lstrip("#")
        r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)
        # Opacity grows linearly from 0.35 (zero displacement) towards 1.0 (at
        # max_disp); the 1e-8 avoids division by zero when max_disp is 0.
        # Without displacement data for this residue, a fixed 0.85 is used.
        alpha = (0.35 + 0.65 * float(displacements[i]) / (max_disp + 1e-8)
                 if displacements is not None and i < len(displacements) else 0.85)
        bg = f"rgba({r},{g},{b},{alpha:.2f})"
        # Extra CSS class for residues with at least one recorded variant.
        mut_cls = " aa-mut" if (mutations is not None and i < len(mutations) and mutations[i] > 0) else ""
        # Tooltip: three-letter name plus 1-based position, then optional details.
        tip = f"{AA_LABELS.get(aa,aa)}{i+1}"
        if displacements is not None and i < len(displacements):
            tip += f" Δ={displacements[i]:.2f}Å"
        if mutations is not None and i < len(mutations) and mutations[i] > 0:
            tip += f" [{int(mutations[i])} variant(s)]"
        tiles.append(f'
{aa}
')
    st.markdown(f'{"".join(tiles)}
', unsafe_allow_html=True)
    # Legend for the colour classes and the border/opacity encodings.
    st.markdown("""
■ Hydrophobic
■ (+) charged
■ (−) charged
■ Polar
■ Special
🔴 border = mutation site · opacity = predicted Δ
""", unsafe_allow_html=True)
# ── UniProt / EBI mutation fetching ──────────────────────────────────
@st.cache_data(ttl=86400, show_spinner=False)
def get_uniprot_id_from_pdb(pdb_id: str) -> str | None:
    """Look up a UniProt accession for a PDB entry via the PDBe mappings API.

    Only the first four characters of ``pdb_id`` are used, lower-cased.
    Returns the first accession listed in the response, or None when the
    request fails, the entry has no UniProt mapping, or an error occurs
    (errors are logged as warnings).
    """
    pdb_id = pdb_id[:4].lower()
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    try:
        response = requests.get(url, timeout=10)
        if not response.ok:
            return None
        payload = response.json()
        if pdb_id not in payload or "UniProt" not in payload[pdb_id]:
            return None
        accessions = payload[pdb_id]["UniProt"]
        # The first key of the mapping is taken as the accession to use.
        return list(accessions.keys())[0]
    except Exception as e:
        logger.warning(f"PDBe Mapping failed for {pdb_id}: {e}")
    return None
@st.cache_data(ttl=86400, show_spinner="Fetching evolutionary mutations...")
def fetch_mutation_frequency(uniprot_id: str, seq_length: int) -> "np.ndarray | None":
    """Fetch known variants from the EBI Proteins API and count them per residue.

    Args:
        uniprot_id: UniProt accession to query.
        seq_length: Length of the returned array; variants whose position
            falls outside ``1..seq_length`` are ignored.

    Returns:
        A float array of length ``seq_length`` where element ``i`` is the
        number of ``VARIANT`` features whose ``begin`` is residue ``i + 1``,
        or None if the request fails or an unexpected error occurs.
    """
    try:
        r = requests.get(
            f"https://www.ebi.ac.uk/proteins/api/variation/{uniprot_id}",
            headers={"Accept": "application/json"}, timeout=15)
        if not r.ok:
            return None
        freqs = np.zeros(seq_length)
        # `or []` guards against an explicit null "features" value.
        for feature in r.json().get("features") or []:
            if feature.get("type") != "VARIANT":
                continue
            try:
                pos = int(feature.get("begin", -1))
            except (TypeError, ValueError):
                # A missing/null or non-numeric position only invalidates this
                # feature; it must not discard the counts gathered so far.
                continue
            # API positions are 1-based; the array is 0-based.
            if 1 <= pos <= seq_length:
                freqs[pos - 1] += 1
        return freqs
    except Exception as e:  # best-effort: callers treat None as "no data"
        logger.warning("Variation API failed for %s: %s", uniprot_id, e)
        return None
@st.cache_data(ttl=86400, show_spinner=False)
def get_protein_mutations(protein_name: str, seq_length: int) -> "np.ndarray | None":
    """Resolve a PDB-style name to per-residue mutation counts.

    Chains the two lookups: the first four characters of ``protein_name`` are
    mapped to a UniProt accession, which is then used to fetch variant counts
    for a sequence of ``seq_length`` residues. Returns None if the name is
    shorter than four characters or no accession is found.
    """
    if len(protein_name) < 4:
        return None
    accession = get_uniprot_id_from_pdb(protein_name[:4])
    return fetch_mutation_frequency(accession, seq_length) if accession else None
# ── PDB structure fetching (for 3D viewer & ATLAS pairs) ─────────────
@st.cache_data(ttl=86400, show_spinner=False)
def fetch_pdb_structure(protein_name: str) -> "str | None":
    """
    Download the PDB-format text for a name like '1A3RH' from RCSB.

    The first four characters are the PDB ID and the optional fifth is the
    chain. When a chain is given, only ATOM records for that chain are kept,
    together with any line starting with HEADER, TITLE, REMARK or END; all
    other records are dropped. Without a chain the file is returned unchanged.

    Returns the PDB text, or None if the name is too short, the download
    fails, or an error occurs (logged as a warning).
    """
    if len(protein_name) < 4:
        return None

    pdb_id = protein_name[:4].upper()
    chain = None
    if len(protein_name) >= 5:
        chain = protein_name[4].upper()

    try:
        response = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb", timeout=20)
        if not response.ok:
            return None
        pdb_text = response.text
        if not chain:
            return pdb_text

        kept_records = []
        for record in pdb_text.split("\n"):
            # The chain identifier is read from the character at index 21.
            is_chain_atom = (
                record.startswith("ATOM") and len(record) > 21 and record[21] == chain
            )
            if is_chain_atom or record.startswith(("HEADER", "TITLE", "REMARK", "END")):
                kept_records.append(record)
        return "\n".join(kept_records)
    except Exception as e:
        logger.warning(f"PDB download failed for {pdb_id}: {e}")
        return None
def _split_pdb_chain(name: str) -> tuple[str, str]:
    """Split a name like '1A3RH' into (PDB ID, chain); chain defaults to "A"."""
    return name[:4].upper(), (name[4].upper() if len(name) >= 5 else "A")


@st.cache_data(ttl=86400, show_spinner=False)
def fetch_atlas_pair_info(pair_name: str) -> dict:
    """
    Parse an ATLAS-style pair '1A3RH_5W23J' into two conformations
    and fetch metadata from RCSB for each.

    Returns:
        ``{}`` if ``pair_name`` contains no underscore. Otherwise a dict with
        keys ``"pair"``, ``"query"`` and ``"target"``; each conformation dict
        always has ``"pdb"``, ``"chain"`` and ``"name"``, and additionally
        ``"title"`` (first 60 characters), ``"resolution"``, ``"method"`` and
        ``"year"`` when the RCSB entry lookup succeeds. A failed lookup is
        logged and leaves those four keys absent for that conformation.
    """
    parts = pair_name.split("_")
    if len(parts) < 2:
        return {}
    query_name, target_name = parts[0], parts[1]
    query_pdb, query_chain = _split_pdb_chain(query_name)
    target_pdb, target_chain = _split_pdb_chain(target_name)
    result = {
        "pair": pair_name,
        "query": {"pdb": query_pdb, "chain": query_chain, "name": query_name},
        "target": {"pdb": target_pdb, "chain": target_chain, "name": target_name},
    }
    for key, pdb_id in [("query", query_pdb), ("target", target_pdb)]:
        try:
            r = requests.get(f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}", timeout=8)
            if not r.ok:
                continue
            d = r.json()
            # `or {}` / `or ""` guard against JSON nulls, which `.get(key, default)`
            # would pass through and which would then break slicing/indexing.
            entry_info = d.get("rcsb_entry_info") or {}
            struct = d.get("struct") or {}
            accession = d.get("rcsb_accession_info") or {}
            result[key]["title"] = (struct.get("title") or "")[:60]
            result[key]["resolution"] = (entry_info.get("resolution_combined") or [None])[0]
            result[key]["method"] = entry_info.get("experimental_method") or ""
            # Only the first four characters of the deposit date are kept.
            result[key]["year"] = (accession.get("deposit_date") or "")[:4]
        except Exception as e:
            # Metadata is optional, so keep going, but log the failure like
            # the other fetchers in this module instead of hiding it.
            logger.warning("RCSB entry metadata fetch failed for %s: %s", pdb_id, e)
    return result