"""External biology APIs (EBI, UniProt, RCSB) for mutation and sequence tracking.""" import requests import numpy as np import streamlit as st import logging logger = logging.getLogger(__name__) # ── Amino acid property colours ────────────────────────────────────── AA_COLORS = { # Hydrophobic "A": "#7c3aed", "V": "#7c3aed", "I": "#6d28d9", "L": "#6d28d9", "M": "#7c3aed", "F": "#5b21b6", "W": "#4c1d95", "P": "#8b5cf6", # Charged positive "K": "#0891b2", "R": "#0e7490", "H": "#06b6d4", # Charged negative "D": "#e11d48", "E": "#be123c", # Polar "S": "#0d9488", "T": "#0f766e", "N": "#115e59", "Q": "#134e4a", # Special "C": "#d97706", "G": "#b45309", "Y": "#92400e", # Unknown "X": "#475569", } AA_LABELS = { "A": "Ala", "V": "Val", "I": "Ile", "L": "Leu", "M": "Met", "F": "Phe", "W": "Trp", "P": "Pro", "K": "Lys", "R": "Arg", "H": "His", "D": "Asp", "E": "Glu", "S": "Ser", "T": "Thr", "N": "Asn", "Q": "Gln", "C": "Cys", "G": "Gly", "Y": "Tyr", "X": "Unk", } # ── Sequence fetching ───────────────────────────────────────────────── @st.cache_data(ttl=86400, show_spinner=False) def get_sequence_from_pdb(protein_name: str) -> str | None: """Fetch amino acid sequence from RCSB for a protein name like '1HO5A'.""" if len(protein_name) < 4: return None pdb_id = protein_name[:4].upper() chain = protein_name[4].upper() if len(protein_name) >= 5 else "A" # Try RCSB REST: entity instance → entity → sequence try: r = requests.get( f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdb_id}/{chain}", timeout=10) if r.ok: entity_id = (r.json() .get("rcsb_polymer_entity_instance_container_identifiers", {}) .get("entity_id")) if entity_id: r2 = requests.get( f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}", timeout=10) if r2.ok: seq = (r2.json() .get("entity_poly", {}) .get("pdbx_seq_one_letter_code_can", "")) seq = seq.replace("\n", "").strip() if seq: return seq except Exception as e: logger.warning(f"RCSB entity fetch failed for {protein_name}: {e}") # Fallback: FASTA endpoint, pick the right chain try: r3 = requests.get(f"https://www.rcsb.org/fasta/entry/{pdb_id}/download", timeout=10) if r3.ok: seq, capture = "", False for line in r3.text.strip().split("\n"): if line.startswith(">"): capture = f"|Chain {chain}|" in line or f"Chain {chain}" in line elif capture: seq += line.strip() if seq: return seq except Exception as e: logger.warning(f"RCSB FASTA fetch failed for {protein_name}: {e}") return None # ── Coloured AA sequence renderer ──────────────────────────────────── def render_sequence_aa( sequence: str, displacements: "np.ndarray | None" = None, mutations: "np.ndarray | None" = None, title: str = "Amino Acid Sequence", ) -> None: """ Render a coloured amino-acid strip in Streamlit. - Tile colour = AA physicochemical property - Opacity = predicted displacement magnitude (if provided) - Red border = known mutation site (if provided) """ if not sequence: st.info("Sequence not available — fetching from RCSB failed.") return n = len(sequence) max_disp = float(np.max(displacements)) if displacements is not None and len(displacements) > 0 else 1.0 st.markdown(f"**{title}** — {n} residues") st.markdown(""" """, unsafe_allow_html=True) tiles = [] for i, aa in enumerate(sequence): color = AA_COLORS.get(aa, "#475569") h = color.lstrip("#") r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16) alpha = (0.35 + 0.65 * float(displacements[i]) / (max_disp + 1e-8) if displacements is not None and i < len(displacements) else 0.85) bg = f"rgba({r},{g},{b},{alpha:.2f})" mut_cls = " aa-mut" if (mutations is not None and i < len(mutations) and mutations[i] > 0) else "" tip = f"{AA_LABELS.get(aa,aa)}{i+1}" if displacements is not None and i < len(displacements): tip += f" Δ={displacements[i]:.2f}Å" if mutations is not None and i < len(mutations) and mutations[i] > 0: tip += f" [{int(mutations[i])} variant(s)]" tiles.append(f'
{aa}
') st.markdown(f'
{"".join(tiles)}
', unsafe_allow_html=True) st.markdown("""
Hydrophobic (+) charged (−) charged Polar Special 🔴 border = mutation site · opacity = predicted Δ
""", unsafe_allow_html=True) # ── UniProt / EBI mutation fetching ────────────────────────────────── @st.cache_data(ttl=86400, show_spinner=False) def get_uniprot_id_from_pdb(pdb_id: str) -> str | None: """Map a 4-letter PDB ID to its primary UniProt accession using PDBe API.""" pdb_id = pdb_id[:4].lower() try: r = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}", timeout=10) if r.ok: data = r.json() if pdb_id in data and "UniProt" in data[pdb_id]: return list(data[pdb_id]["UniProt"].keys())[0] except Exception as e: logger.warning(f"PDBe Mapping failed for {pdb_id}: {e}") return None @st.cache_data(ttl=86400, show_spinner="Fetching evolutionary mutations...") def fetch_mutation_frequency(uniprot_id: str, seq_length: int) -> "np.ndarray | None": """Fetch known natural variants from EBI and return count per residue.""" try: r = requests.get( f"https://www.ebi.ac.uk/proteins/api/variation/{uniprot_id}", headers={"Accept": "application/json"}, timeout=15) if not r.ok: return None freqs = np.zeros(seq_length) for f in r.json().get("features", []): if f.get("type") == "VARIANT": try: pos = int(f.get("begin", -1)) if 1 <= pos <= seq_length: freqs[pos - 1] += 1 except ValueError: continue return freqs except Exception as e: logger.warning(f"Variation API failed for {uniprot_id}: {e}") return None @st.cache_data(ttl=86400, show_spinner=False) def get_protein_mutations(protein_name: str, seq_length: int) -> "np.ndarray | None": """End-to-end: PDB Name → UniProt → Mutation Frequencies.""" if len(protein_name) >= 4: uid = get_uniprot_id_from_pdb(protein_name[:4]) if uid: return fetch_mutation_frequency(uid, seq_length) return None # ── PDB structure fetching (for 3D viewer & ATLAS pairs) ───────────── @st.cache_data(ttl=86400, show_spinner=False) def fetch_pdb_structure(protein_name: str) -> "str | None": """ Download PDB text for a protein like '1A3RH' from RCSB. Filters to the correct chain. Returns PDB text string or None. """ if len(protein_name) < 4: return None pdb_id = protein_name[:4].upper() chain = protein_name[4].upper() if len(protein_name) >= 5 else None try: r = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb", timeout=20) if not r.ok: return None pdb_text = r.text if chain: lines = [l for l in pdb_text.split("\n") if (l.startswith("ATOM") and len(l) > 21 and l[21] == chain) or l.startswith(("HEADER", "TITLE", "REMARK", "END"))] pdb_text = "\n".join(lines) return pdb_text except Exception as e: logger.warning(f"PDB download failed for {pdb_id}: {e}") return None @st.cache_data(ttl=86400, show_spinner=False) def fetch_atlas_pair_info(pair_name: str) -> dict: """ Parse an ATLAS-style pair '1A3RH_5W23J' into two conformations and fetch metadata from RCSB for each. Returns dict with query/target PDB IDs, chains, titles, and resolution. """ parts = pair_name.split("_") if len(parts) < 2: return {} query_name, target_name = parts[0], parts[1] query_pdb, query_chain = query_name[:4].upper(), (query_name[4].upper() if len(query_name) >= 5 else "A") target_pdb, target_chain = target_name[:4].upper(), (target_name[4].upper() if len(target_name) >= 5 else "A") result = { "pair": pair_name, "query": {"pdb": query_pdb, "chain": query_chain, "name": query_name}, "target": {"pdb": target_pdb, "chain": target_chain, "name": target_name}, } for key, pdb_id in [("query", query_pdb), ("target", target_pdb)]: try: r = requests.get(f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}", timeout=8) if r.ok: d = r.json() result[key]["title"] = d.get("struct", {}).get("title", "")[:60] result[key]["resolution"] = (d.get("rcsb_entry_info", {}) .get("resolution_combined", [None]) or [None])[0] result[key]["method"] = d.get("rcsb_entry_info", {}).get("experimental_method", "") result[key]["year"] = d.get("rcsb_accession_info", {}).get("deposit_date", "")[:4] except Exception: pass return result