# Spaces: Valmbd / Petimot — Running — File size: 11,172 Bytes

"""External biology APIs (EBI, UniProt, RCSB) for mutation and sequence tracking."""
import requests
import numpy as np
import streamlit as st
import logging

logger = logging.getLogger(__name__)

# ── Amino acid property colours ──────────────────────────────────────
# One-letter residue code -> hex colour of its tile. Entries are grouped by
# the property classes named in the comments below.
AA_COLORS = {
    # Hydrophobic
    "A": "#7c3aed",
    "V": "#7c3aed",
    "I": "#6d28d9",
    "L": "#6d28d9",
    "M": "#7c3aed",
    "F": "#5b21b6",
    "W": "#4c1d95",
    "P": "#8b5cf6",
    # Charged positive
    "K": "#0891b2",
    "R": "#0e7490",
    "H": "#06b6d4",
    # Charged negative
    "D": "#e11d48",
    "E": "#be123c",
    # Polar
    "S": "#0d9488",
    "T": "#0f766e",
    "N": "#115e59",
    "Q": "#134e4a",
    # Special
    "C": "#d97706",
    "G": "#b45309",
    "Y": "#92400e",
    # Unknown
    "X": "#475569",
}

# One-letter residue code -> three-letter abbreviation used in tooltips.
AA_LABELS = {
    "A": "Ala",
    "V": "Val",
    "I": "Ile",
    "L": "Leu",
    "M": "Met",
    "F": "Phe",
    "W": "Trp",
    "P": "Pro",
    "K": "Lys",
    "R": "Arg",
    "H": "His",
    "D": "Asp",
    "E": "Glu",
    "S": "Ser",
    "T": "Thr",
    "N": "Asn",
    "Q": "Gln",
    "C": "Cys",
    "G": "Gly",
    "Y": "Tyr",
    "X": "Unk",
}


# ── Sequence fetching ─────────────────────────────────────────────────
def _fasta_header_chain_ids(header: str) -> set[str]:
    """Return every chain ID listed in a ``|``-separated FASTA header line.

    The chain field is the first field starting with ``Chain `` or
    ``Chains ``; it holds a comma-separated list whose items may carry an
    author ID suffix, e.g. ``>1ABC_1|Chains A, B[auth H]|title|organism``.
    Both the plain ID and the ``[auth …]`` ID are returned.
    """
    for field in header.lstrip(">").split("|"):
        field = field.strip()
        if field.startswith("Chains "):
            listed = field.removeprefix("Chains ")
        elif field.startswith("Chain "):
            listed = field.removeprefix("Chain ")
        else:
            continue
        ids: set[str] = set()
        for token in listed.split(","):
            label, _, auth = token.strip().partition("[auth ")
            ids.update(part for part in (label.strip(), auth.rstrip("] ").strip()) if part)
        return ids
    return set()


def _fasta_sequence_for_chain(fasta_text: str, chain: str) -> str:
    """Return the sequence of the first FASTA record whose header lists *chain*.

    Returns an empty string when no record matches.
    """
    pieces: list[str] = []
    capturing = False
    for line in fasta_text.splitlines():
        line = line.strip()
        if line.startswith(">"):
            if pieces:
                # The first matching record is complete; do not append the
                # sequence of any later record to it.
                break
            capturing = chain in _fasta_header_chain_ids(line)
        elif capturing:
            pieces.append(line)
    return "".join(pieces)


@st.cache_data(ttl=86400, show_spinner=False)
def get_sequence_from_pdb(protein_name: str) -> str | None:
    """Fetch the amino acid sequence from RCSB for a name like '1HO5A'.

    The first four characters are the PDB ID and the fifth (if present) is
    the chain; the chain defaults to "A". Both are upper-cased.

    Two sources are tried in order:
      1. RCSB REST: polymer entity instance -> entity -> canonical sequence.
      2. The entry's FASTA download, selecting the record that lists the chain.

    Returns:
        The one-letter sequence, or None when the name is shorter than four
        characters or neither source yields a sequence. Request and parsing
        errors are logged as warnings, never raised.
    """
    if len(protein_name) < 4:
        return None
    pdb_id = protein_name[:4].upper()
    chain = protein_name[4].upper() if len(protein_name) >= 5 else "A"

    # Try RCSB REST: entity instance → entity → sequence
    # NOTE(review): confirm the chain letter in protein names is the ID this
    # endpoint expects (it may differ from the author chain ID).
    try:
        r = requests.get(
            f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdb_id}/{chain}",
            timeout=10)
        if r.ok:
            entity_id = (r.json()
                         .get("rcsb_polymer_entity_instance_container_identifiers", {})
                         .get("entity_id"))
            if entity_id:
                r2 = requests.get(
                    f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}",
                    timeout=10)
                if r2.ok:
                    # `or ""` keeps a JSON null from raising on .replace().
                    seq = (r2.json()
                           .get("entity_poly", {})
                           .get("pdbx_seq_one_letter_code_can") or "")
                    seq = seq.replace("\n", "").strip()
                    if seq:
                        return seq
    except Exception as e:
        # Best-effort: any failure here falls through to the FASTA fallback.
        logger.warning("RCSB entity fetch failed for %s: %s", protein_name, e)

    # Fallback: FASTA endpoint, pick the right chain
    try:
        r3 = requests.get(f"https://www.rcsb.org/fasta/entry/{pdb_id}/download", timeout=10)
        if r3.ok:
            seq = _fasta_sequence_for_chain(r3.text, chain)
            if seq:
                return seq
    except Exception as e:
        logger.warning("RCSB FASTA fetch failed for %s: %s", protein_name, e)
    return None


# ── Coloured AA sequence renderer ────────────────────────────────────
def _hex_to_rgb_csv(hex_colour: str) -> str:
    """Convert '#rrggbb' to the 'r,g,b' decimal triple used inside rgba()."""
    h = hex_colour.lstrip("#")
    return f"{int(h[0:2], 16)},{int(h[2:4], 16)},{int(h[4:6], 16)}"


def render_sequence_aa(
    sequence: str,
    displacements: "np.ndarray | None" = None,
    mutations: "np.ndarray | None" = None,
    title: str = "Amino Acid Sequence",
) -> None:
    """
    Render a coloured amino-acid strip in Streamlit.
    - Tile colour = AA physicochemical property (AA_COLORS; grey if unknown)
    - Opacity = predicted displacement magnitude (if provided), scaled from
      0.35 up to ~1.0 relative to the largest finite displacement; tiles
      without a finite displacement use a fixed 0.85
    - Red border = known mutation site (if provided, where mutations[i] > 0)

    Args:
        sequence: One-letter residue string. Empty/None shows an info box
            and renders nothing else.
        displacements: Optional per-residue values, indexed like `sequence`.
            Entries beyond its length are treated as missing.
        mutations: Optional per-residue variant counts, indexed like
            `sequence`.
        title: Heading shown above the strip.

    Returns:
        None. Output is written through `st.markdown` / `st.info`.
    """
    # Local import so this function does not depend on a file-level import.
    from html import escape

    if not sequence:
        st.info("Sequence not available — fetching from RCSB failed.")
        return

    n = len(sequence)

    # Scale opacity by the largest *finite* displacement: a NaN/inf maximum
    # would turn every alpha into "nan", which is invalid CSS.
    max_disp = 1.0
    if displacements is not None and len(displacements) > 0:
        finite = np.asarray(displacements, dtype=float)
        finite = finite[np.isfinite(finite)]
        if finite.size:
            max_disp = float(finite.max())

    st.markdown(f"**{title}** — {n} residues")
    st.markdown("""
<style>
.seq-strip{display:flex;flex-wrap:wrap;gap:2px;margin-bottom:8px;}
.aa-tile{width:22px;height:22px;border-radius:4px;display:flex;align-items:center;
  justify-content:center;font-size:10px;font-weight:700;color:white;cursor:default;
  border:2px solid transparent;transition:transform .1s;}
.aa-tile:hover{transform:scale(1.35);z-index:10;}
.aa-mut{border:2px solid #f43f5e!important;}
</style>""", unsafe_allow_html=True)

    # Parse each distinct residue's colour once rather than once per tile.
    rgb_by_residue = {
        aa: _hex_to_rgb_csv(AA_COLORS.get(aa, "#475569")) for aa in set(sequence)
    }

    tiles = []
    for i, aa in enumerate(sequence):
        has_disp = displacements is not None and i < len(displacements)
        disp = float(displacements[i]) if has_disp else None
        if disp is not None and np.isfinite(disp):
            alpha = 0.35 + 0.65 * disp / (max_disp + 1e-8)
        else:
            alpha = 0.85
        bg = f"rgba({rgb_by_residue[aa]},{alpha:.2f})"

        is_mutated = mutations is not None and i < len(mutations) and mutations[i] > 0
        mut_cls = " aa-mut" if is_mutated else ""

        tip = f"{AA_LABELS.get(aa, aa)}{i+1}"
        if disp is not None:
            tip += f" Δ={disp:.2f}Å"
        if is_mutated:
            tip += f" [{int(mutations[i])} variant(s)]"

        # The sequence comes from a remote API and is emitted as raw HTML
        # below, so escape everything derived from it.
        tiles.append(
            f'<div class="aa-tile{mut_cls}" style="background:{bg}" '
            f'title="{escape(tip)}">{escape(aa)}</div>'
        )

    st.markdown(f'<div class="seq-strip">{"".join(tiles)}</div>', unsafe_allow_html=True)
    st.markdown("""
<div style="display:flex;gap:12px;flex-wrap:wrap;font-size:11px;color:#94a3b8;margin-top:2px;">
<span><span style="background:#7c3aed;padding:1px 5px;border-radius:3px;color:white">■</span> Hydrophobic</span>
<span><span style="background:#0891b2;padding:1px 5px;border-radius:3px;color:white">■</span> (+) charged</span>
<span><span style="background:#e11d48;padding:1px 5px;border-radius:3px;color:white">■</span> (−) charged</span>
<span><span style="background:#0d9488;padding:1px 5px;border-radius:3px;color:white">■</span> Polar</span>
<span><span style="background:#d97706;padding:1px 5px;border-radius:3px;color:white">■</span> Special</span>
<span style="color:#f43f5e;">🔴 border = mutation site · opacity = predicted Δ</span>
</div>""", unsafe_allow_html=True)


# ── UniProt / EBI mutation fetching ──────────────────────────────────
@st.cache_data(ttl=86400, show_spinner=False)
def get_uniprot_id_from_pdb(pdb_id: str) -> str | None:
    """Map a 4-letter PDB ID to a UniProt accession using the PDBe API.

    Only the first four characters of `pdb_id` are used, lower-cased. The
    accession returned is the first key of the response's "UniProt" mapping
    for that entry.

    Returns:
        The accession string, or None when the request fails, the entry has
        no UniProt mapping, or the mapping is empty. Errors are logged as
        warnings, never raised.
    """
    pdb_id = pdb_id[:4].lower()
    try:
        r = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}", timeout=10)
        if r.ok:
            entry = r.json().get(pdb_id) or {}
            accessions = entry.get("UniProt") or {}
            # An empty mapping is a valid "no match", not a failure: return
            # None directly instead of raising IndexError into the handler.
            return next(iter(accessions), None)
    except Exception as e:
        logger.warning("PDBe Mapping failed for %s: %s", pdb_id, e)
    return None


@st.cache_data(ttl=86400, show_spinner="Fetching evolutionary mutations...")
def fetch_mutation_frequency(uniprot_id: str, seq_length: int) -> "np.ndarray | None":
    """Fetch known variants from the EBI Proteins API and count them per residue.

    Args:
        uniprot_id: Accession inserted into the variation endpoint URL.
        seq_length: Length of the returned array; variants whose position
            falls outside 1..seq_length are ignored.

    Returns:
        A float array of length `seq_length` where element `i` is the number
        of "VARIANT" features whose `begin` equals `i + 1`, or None when the
        request fails or the response cannot be processed.

    NOTE(review): positions are used exactly as reported by the API; confirm
    they line up with the residue indexing of the sequence this is overlaid on.
    """
    try:
        r = requests.get(
            f"https://www.ebi.ac.uk/proteins/api/variation/{uniprot_id}",
            headers={"Accept": "application/json"}, timeout=15)
        if not r.ok:
            return None
        freqs = np.zeros(seq_length)
        for feature in r.json().get("features") or []:
            if feature.get("type") != "VARIANT":
                continue
            try:
                pos = int(feature.get("begin", -1))
            except (TypeError, ValueError):
                # Skip only this feature. A null `begin` raises TypeError,
                # which must not discard the counts already collected.
                continue
            if 1 <= pos <= seq_length:
                freqs[pos - 1] += 1
        return freqs
    except Exception as e:
        logger.warning("Variation API failed for %s: %s", uniprot_id, e)
    return None


@st.cache_data(ttl=86400, show_spinner=False)
def get_protein_mutations(protein_name: str, seq_length: int) -> "np.ndarray | None":
    """Resolve a protein name to per-residue mutation counts.

    Chains the two lookups: the first four characters of `protein_name` are
    mapped to a UniProt accession, which is then used to fetch the variant
    counts for a sequence of `seq_length` residues.

    Returns None when the name is shorter than four characters, no accession
    is found, or the variant lookup itself returns None.
    """
    if len(protein_name) < 4:
        return None

    accession = get_uniprot_id_from_pdb(protein_name[:4])
    if not accession:
        return None

    return fetch_mutation_frequency(accession, seq_length)


# ── PDB structure fetching (for 3D viewer & ATLAS pairs) ─────────────
@st.cache_data(ttl=86400, show_spinner=False)
def fetch_pdb_structure(protein_name: str) -> "str | None":
    """
    Download the PDB-format text for a name such as '1A3RH' from RCSB.

    Characters 1-4 are the PDB ID; an optional fifth character selects a
    chain (both upper-cased). With a chain, only ATOM records whose column 22
    equals that chain are kept, together with every line starting with
    HEADER, TITLE, REMARK or END. Without a chain the file is returned whole.

    Returns the PDB text, or None if the name is too short, the download
    is not OK, or an exception occurs (logged as a warning).
    """
    if len(protein_name) < 4:
        return None

    pdb_id = protein_name[:4].upper()
    chain = protein_name[4].upper() if len(protein_name) >= 5 else None

    try:
        response = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb", timeout=20)
        if not response.ok:
            return None

        full_text = response.text
        if not chain:
            return full_text

        kept_records = []
        for record in full_text.split("\n"):
            is_chain_atom = (
                record.startswith("ATOM") and len(record) > 21 and record[21] == chain
            )
            is_metadata = record.startswith(("HEADER", "TITLE", "REMARK", "END"))
            if is_chain_atom or is_metadata:
                kept_records.append(record)
        return "\n".join(kept_records)
    except Exception as e:
        logger.warning(f"PDB download failed for {pdb_id}: {e}")
        return None


@st.cache_data(ttl=86400, show_spinner=False)
def fetch_atlas_pair_info(pair_name: str) -> dict:
    """
    Parse an ATLAS-style pair '1A3RH_5W23J' into two conformations
    and fetch metadata from RCSB for each.

    The name is split on "_"; the first two parts are the query and target.
    For each, characters 1-4 are the PDB ID and the fifth is the chain
    (default "A"); both are upper-cased.

    Returns:
        {} when the name has no "_" separator. Otherwise a dict with keys
        "pair", "query" and "target"; each of the latter holds "pdb",
        "chain" and "name", plus "title" (first 60 characters),
        "resolution", "method" and "year" when the RCSB entry lookup
        succeeded. Lookup failures are logged as warnings and leave those
        four keys absent for that side.
    """
    parts = pair_name.split("_")
    if len(parts) < 2:
        return {}
    query_name, target_name = parts[0], parts[1]
    query_pdb, query_chain   = query_name[:4].upper(),  (query_name[4].upper()  if len(query_name)  >= 5 else "A")
    target_pdb, target_chain = target_name[:4].upper(), (target_name[4].upper() if len(target_name) >= 5 else "A")

    result = {
        "pair":   pair_name,
        "query":  {"pdb": query_pdb,  "chain": query_chain,  "name": query_name},
        "target": {"pdb": target_pdb, "chain": target_chain, "name": target_name},
    }
    for key, pdb_id in (("query", query_pdb), ("target", target_pdb)):
        try:
            r = requests.get(f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}", timeout=8)
            if not r.ok:
                continue
            d = r.json()
            # `or {}` / `or ""` guard against JSON nulls, which .get()
            # defaults do not cover.
            struct = d.get("struct") or {}
            entry_info = d.get("rcsb_entry_info") or {}
            accession_info = d.get("rcsb_accession_info") or {}
            metadata = {
                "title":      (struct.get("title") or "")[:60],
                "resolution": (entry_info.get("resolution_combined") or [None])[0],
                "method":     entry_info.get("experimental_method", ""),
                "year":       (accession_info.get("deposit_date") or "")[:4],
            }
            # Apply all four keys together so a failure never leaves a
            # half-filled record.
            result[key].update(metadata)
        except Exception as e:
            # Metadata is optional, so keep going, but log the failure.
            logger.warning("RCSB entry metadata fetch failed for %s: %s", pdb_id, e)
    return result