Petimot / app /utils /bio_api.py
Valmbd's picture
ATLAS integration: RCSB on-demand PDB + pair metadata (query/target conformations)
7cff683 verified
"""External biology APIs (EBI, UniProt, RCSB) for mutation and sequence tracking."""
import requests
import numpy as np
import streamlit as st
import logging
logger = logging.getLogger(__name__)
# ── Amino acid property colours ──────────────────────────────────────
# One-letter amino-acid code -> hex tile colour, grouped by physicochemical
# class. Insertion order matches the grouping below.
AA_COLORS = {
    # Hydrophobic
    "A": "#7c3aed",
    "V": "#7c3aed",
    "I": "#6d28d9",
    "L": "#6d28d9",
    "M": "#7c3aed",
    "F": "#5b21b6",
    "W": "#4c1d95",
    "P": "#8b5cf6",
    # Positively charged
    "K": "#0891b2",
    "R": "#0e7490",
    "H": "#06b6d4",
    # Negatively charged
    "D": "#e11d48",
    "E": "#be123c",
    # Polar
    "S": "#0d9488",
    "T": "#0f766e",
    "N": "#115e59",
    "Q": "#134e4a",
    # Special
    "C": "#d97706",
    "G": "#b45309",
    "Y": "#92400e",
    # Unknown residue
    "X": "#475569",
}

# One-letter amino-acid code -> three-letter abbreviation used in tooltips.
AA_LABELS = {
    "A": "Ala",
    "V": "Val",
    "I": "Ile",
    "L": "Leu",
    "M": "Met",
    "F": "Phe",
    "W": "Trp",
    "P": "Pro",
    "K": "Lys",
    "R": "Arg",
    "H": "His",
    "D": "Asp",
    "E": "Glu",
    "S": "Ser",
    "T": "Thr",
    "N": "Asn",
    "Q": "Gln",
    "C": "Cys",
    "G": "Gly",
    "Y": "Tyr",
    "X": "Unk",
}
# ── Sequence fetching ─────────────────────────────────────────────────
def _fasta_header_chain_ids(header: str) -> tuple[set[str], set[str]]:
    """Split an RCSB FASTA header into (author chain IDs, label chain IDs).

    Handles the chain field forms ``Chain A``, ``Chains A, B`` and
    ``Chains A[auth H], B[auth L]``. A chain without an ``[auth ...]``
    suffix uses its label ID as its author ID as well.
    """
    fields = header.split("|")
    if len(fields) < 2:
        return set(), set()
    chain_field = fields[1].strip().removeprefix("Chains").removeprefix("Chain")
    author_ids: set[str] = set()
    label_ids: set[str] = set()
    for token in chain_field.split(","):
        label, has_auth, auth = token.strip().partition("[auth ")
        label = label.strip()
        if not label:
            continue
        label_ids.add(label)
        author_ids.add(auth.rstrip("]").strip() if has_auth else label)
    return author_ids, label_ids


def _fasta_sequence_for_chain(fasta_text: str, chain: str) -> str:
    """Return the sequence of the first FASTA record listing *chain*, or ""."""
    records: list[tuple[str, list[str]]] = []
    for line in fasta_text.splitlines():
        if line.startswith(">"):
            records.append((line, []))
        elif records:
            records[-1][1].append(line.strip())

    label_match = ""
    for header, chunks in records:
        author_ids, label_ids = _fasta_header_chain_ids(header)
        if chain in author_ids:
            # Author chain IDs are what appear in PDB-format files, so an
            # author match wins outright.
            return "".join(chunks)
        if not label_match and chain in label_ids:
            label_match = "".join(chunks)
    return label_match


@st.cache_data(ttl=86400, show_spinner=False)
def get_sequence_from_pdb(protein_name: str) -> str | None:
    """Fetch the amino-acid sequence from RCSB for a name like '1HO5A'.

    The first four characters are the PDB ID and the optional fifth character
    is the chain (defaults to "A"); both are upper-cased.

    Two sources are tried in order:
      1. RCSB Data API: polymer entity instance -> entity -> canonical
         one-letter sequence.
      2. RCSB FASTA download for the entry, picking the record whose header
         lists the requested chain.

    Returns the sequence string, or None if the name is shorter than four
    characters or both sources fail. Results are cached for 24 hours.
    """
    if len(protein_name) < 4:
        return None
    pdb_id = protein_name[:4].upper()
    chain = protein_name[4].upper() if len(protein_name) >= 5 else "A"

    # Primary: RCSB REST, entity instance -> entity -> sequence.
    # NOTE(review): this endpoint is keyed by the chain ID as given; if names
    # carry author chain IDs that differ from label IDs it may miss and fall
    # through to the FASTA path below - confirm against the RCSB docs.
    try:
        r = requests.get(
            f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdb_id}/{chain}",
            timeout=10)
        if r.ok:
            identifiers = r.json().get(
                "rcsb_polymer_entity_instance_container_identifiers") or {}
            entity_id = identifiers.get("entity_id")
            if entity_id:
                r2 = requests.get(
                    f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}",
                    timeout=10)
                if r2.ok:
                    entity_poly = r2.json().get("entity_poly") or {}
                    seq = entity_poly.get("pdbx_seq_one_letter_code_can") or ""
                    seq = seq.replace("\n", "").strip()
                    if seq:
                        return seq
    except Exception as e:
        # Best-effort boundary: any failure here just triggers the fallback.
        logger.warning(f"RCSB entity fetch failed for {protein_name}: {e}")

    # Fallback: FASTA endpoint, pick the record that lists the right chain.
    try:
        r3 = requests.get(f"https://www.rcsb.org/fasta/entry/{pdb_id}/download", timeout=10)
        if r3.ok:
            seq = _fasta_sequence_for_chain(r3.text, chain)
            if seq:
                return seq
    except Exception as e:
        logger.warning(f"RCSB FASTA fetch failed for {protein_name}: {e}")
    return None
# ── Coloured AA sequence renderer ────────────────────────────────────
def render_sequence_aa(
    sequence: str,
    displacements: "np.ndarray | None" = None,
    mutations: "np.ndarray | None" = None,
    title: str = "Amino Acid Sequence",
) -> None:
    """
    Render a coloured amino-acid strip in Streamlit.

    - Tile colour = AA physicochemical property (AA_COLORS, grey if unknown)
    - Opacity     = displacement at that index relative to the maximum
                    displacement (0.85 when no displacement is available)
    - Red border  = mutation count > 0 at that index (if provided)

    Args:
        sequence: One-letter amino-acid string; an empty value shows an info
            message and renders nothing else.
        displacements: Optional per-residue values indexed like ``sequence``;
            residues beyond its length fall back to the default opacity.
        mutations: Optional per-residue variant counts indexed like
            ``sequence``; residues beyond its length are treated as unmutated.
        title: Heading shown above the strip.
    """
    import html  # function-scope import: only needed to escape tile text

    if not sequence:
        st.info("Sequence not available — fetching from RCSB failed.")
        return

    n = len(sequence)
    n_disp = len(displacements) if displacements is not None else 0
    n_mut = len(mutations) if mutations is not None else 0
    max_disp = float(np.max(displacements)) if n_disp > 0 else 1.0

    st.markdown(f"**{title}** — {n} residues")
    # The HTML/CSS literals below are kept flush-left on purpose: Markdown
    # treats lines indented by four spaces as code blocks.
    st.markdown("""
<style>
.seq-strip{display:flex;flex-wrap:wrap;gap:2px;margin-bottom:8px;}
.aa-tile{width:22px;height:22px;border-radius:4px;display:flex;align-items:center;
justify-content:center;font-size:10px;font-weight:700;color:white;cursor:default;
border:2px solid transparent;transition:transform .1s;}
.aa-tile:hover{transform:scale(1.35);z-index:10;}
.aa-mut{border:2px solid #f43f5e!important;}
</style>""", unsafe_allow_html=True)

    tiles = []
    for i, aa in enumerate(sequence):
        h = AA_COLORS.get(aa, "#475569").lstrip("#")
        r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)

        has_disp = i < n_disp
        # Opacity ranges from 0.35 (no motion) up to ~1.0 (largest motion);
        # the epsilon avoids division by zero when every displacement is 0.
        alpha = (0.35 + 0.65 * float(displacements[i]) / (max_disp + 1e-8)
                 if has_disp else 0.85)
        bg = f"rgba({r},{g},{b},{alpha:.2f})"

        is_mutated = i < n_mut and mutations[i] > 0
        mut_cls = " aa-mut" if is_mutated else ""

        tip = f"{AA_LABELS.get(aa,aa)}{i+1}"
        if has_disp:
            tip += f" Δ={displacements[i]:.2f}Å"
        if is_mutated:
            tip += f" [{int(mutations[i])} variant(s)]"

        # The sequence comes from an external service and is injected into raw
        # HTML, so escape both the attribute value and the tile text.
        tiles.append(
            f'<div class="aa-tile{mut_cls}" style="background:{bg}" '
            f'title="{html.escape(tip, quote=True)}">{html.escape(aa)}</div>')

    st.markdown(f'<div class="seq-strip">{"".join(tiles)}</div>', unsafe_allow_html=True)
    st.markdown("""
<div style="display:flex;gap:12px;flex-wrap:wrap;font-size:11px;color:#94a3b8;margin-top:2px;">
<span><span style="background:#7c3aed;padding:1px 5px;border-radius:3px;color:white">■</span> Hydrophobic</span>
<span><span style="background:#0891b2;padding:1px 5px;border-radius:3px;color:white">■</span> (+) charged</span>
<span><span style="background:#e11d48;padding:1px 5px;border-radius:3px;color:white">■</span> (−) charged</span>
<span><span style="background:#0d9488;padding:1px 5px;border-radius:3px;color:white">■</span> Polar</span>
<span><span style="background:#d97706;padding:1px 5px;border-radius:3px;color:white">■</span> Special</span>
<span style="color:#f43f5e;">🔴 border = mutation site · opacity = predicted Δ</span>
</div>""", unsafe_allow_html=True)
# ── UniProt / EBI mutation fetching ──────────────────────────────────
@st.cache_data(ttl=86400, show_spinner=False)
def get_uniprot_id_from_pdb(pdb_id: str) -> str | None:
    """Look up the UniProt accession for a PDB entry via the PDBe mappings API.

    Only the first four characters of ``pdb_id`` are used, lower-cased.
    Returns the first accession listed under the entry's "UniProt" mapping,
    or None when the request fails, the entry has no mapping, or any error
    occurs (errors are logged as warnings). Cached for 24 hours.
    """
    pdb_id = pdb_id[:4].lower()
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    try:
        response = requests.get(url, timeout=10)
        if not response.ok:
            return None
        payload = response.json()
        if pdb_id not in payload or "UniProt" not in payload[pdb_id]:
            return None
        accessions = list(payload[pdb_id]["UniProt"].keys())
        # An empty mapping raises IndexError here, which is handled below.
        return accessions[0]
    except Exception as e:
        logger.warning(f"PDBe Mapping failed for {pdb_id}: {e}")
        return None
@st.cache_data(ttl=86400, show_spinner="Fetching evolutionary mutations...")
def fetch_mutation_frequency(uniprot_id: str, seq_length: int) -> "np.ndarray | None":
    """Count known variants per residue using the EBI Proteins variation API.

    Args:
        uniprot_id: UniProt accession to query.
        seq_length: Length of the returned array; variants positioned outside
            ``1..seq_length`` are ignored.

    Returns:
        A float array of length ``seq_length`` where index ``i`` holds the
        number of features of type "VARIANT" whose ``begin`` is ``i + 1``,
        or None if the request fails or is not OK. Cached for 24 hours.
    """
    try:
        r = requests.get(
            f"https://www.ebi.ac.uk/proteins/api/variation/{uniprot_id}",
            headers={"Accept": "application/json"}, timeout=15)
        if not r.ok:
            return None
        freqs = np.zeros(seq_length)
        for feature in r.json().get("features") or []:
            if feature.get("type") != "VARIANT":
                continue
            try:
                pos = int(feature.get("begin", -1))
            except (TypeError, ValueError):
                # A null or non-numeric position invalidates only this
                # feature; it must not discard the counts gathered so far.
                continue
            if 1 <= pos <= seq_length:
                freqs[pos - 1] += 1
        return freqs
    except Exception as e:
        # Best-effort boundary: network/JSON problems yield "no data".
        logger.warning(f"Variation API failed for {uniprot_id}: {e}")
        return None
@st.cache_data(ttl=86400, show_spinner=False)
def get_protein_mutations(protein_name: str, seq_length: int) -> "np.ndarray | None":
    """Chain the lookups: PDB name -> UniProt accession -> mutation counts.

    Uses the first four characters of ``protein_name`` as the PDB ID.
    Returns whatever ``fetch_mutation_frequency`` returns, or None when the
    name is shorter than four characters or no UniProt accession is found.
    """
    if len(protein_name) < 4:
        return None
    accession = get_uniprot_id_from_pdb(protein_name[:4])
    if not accession:
        return None
    return fetch_mutation_frequency(accession, seq_length)
# ── PDB structure fetching (for 3D viewer & ATLAS pairs) ─────────────
@st.cache_data(ttl=86400, show_spinner=False)
def fetch_pdb_structure(protein_name: str) -> "str | None":
    """
    Download the PDB-format text of an entry such as '1A3RH' from RCSB.

    The first four characters are the PDB ID; an optional fifth character
    selects a chain (upper-cased). With a chain, only ATOM records whose
    chain column (index 21) matches are kept, together with every line
    starting with HEADER, TITLE, REMARK or END. Without a chain the file
    is returned unfiltered.

    Returns the PDB text, or None if the name is too short, the download
    is not OK, or an error occurs (logged as a warning).
    """
    if len(protein_name) < 4:
        return None

    pdb_id = protein_name[:4].upper()
    chain = protein_name[4].upper() if len(protein_name) >= 5 else None
    passthrough_prefixes = ("HEADER", "TITLE", "REMARK", "END")

    try:
        response = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb", timeout=20)
        if not response.ok:
            return None
        full_text = response.text
        if not chain:
            return full_text

        kept = []
        for record in full_text.split("\n"):
            is_chain_atom = (record.startswith("ATOM")
                             and len(record) > 21
                             and record[21] == chain)
            if is_chain_atom or record.startswith(passthrough_prefixes):
                kept.append(record)
        return "\n".join(kept)
    except Exception as e:
        logger.warning(f"PDB download failed for {pdb_id}: {e}")
        return None
def _rcsb_entry_summary(pdb_id: str) -> dict:
    """Fetch title/resolution/method/year for one RCSB entry; {} on failure."""
    try:
        r = requests.get(f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}", timeout=8)
        if not r.ok:
            return {}
        d = r.json()
        entry_info = d.get("rcsb_entry_info") or {}
        # `or` guards cover keys that are present but null in the response.
        return {
            "title": ((d.get("struct") or {}).get("title") or "")[:60],
            "resolution": (entry_info.get("resolution_combined") or [None])[0],
            "method": entry_info.get("experimental_method", ""),
            "year": ((d.get("rcsb_accession_info") or {}).get("deposit_date") or "")[:4],
        }
    except Exception as e:
        # Metadata is optional decoration: log and carry on without it.
        logger.warning(f"RCSB entry metadata fetch failed for {pdb_id}: {e}")
        return {}


@st.cache_data(ttl=86400, show_spinner=False)
def fetch_atlas_pair_info(pair_name: str) -> dict:
    """
    Parse an ATLAS-style pair '1A3RH_5W23J' into two conformations
    and fetch metadata from RCSB for each.

    The name is split on "_"; the first two parts are the query and target.
    For each, the first four characters are the PDB ID and the fifth is the
    chain (defaults to "A"); both are upper-cased.

    Returns:
        {} if the name has fewer than two parts; otherwise a dict with
        "pair", "query" and "target". Each of query/target always holds
        "pdb", "chain" and "name", plus "title" (truncated to 60 chars),
        "resolution", "method" and "year" when the RCSB lookup succeeds.
    """
    parts = pair_name.split("_")
    if len(parts) < 2:
        return {}
    query_name, target_name = parts[0], parts[1]
    query_pdb, query_chain = query_name[:4].upper(), (query_name[4].upper() if len(query_name) >= 5 else "A")
    target_pdb, target_chain = target_name[:4].upper(), (target_name[4].upper() if len(target_name) >= 5 else "A")
    result = {
        "pair": pair_name,
        "query": {"pdb": query_pdb, "chain": query_chain, "name": query_name},
        "target": {"pdb": target_pdb, "chain": target_chain, "name": target_name},
    }

    # Both conformations may come from the same entry (different chains);
    # fetch each distinct entry only once.
    summaries: dict = {}
    for key in ("query", "target"):
        pdb_id = result[key]["pdb"]
        if len(pdb_id) < 4:
            # Not a full PDB ID, so there is nothing meaningful to look up.
            continue
        if pdb_id not in summaries:
            summaries[pdb_id] = _rcsb_entry_summary(pdb_id)
        result[key].update(summaries[pdb_id])
    return result