Spaces:

ym59
/

VeloBind

Running

App Files Files Community

ym59 committed on Mar 16

Commit

5797e4d

verified ·

1 Parent(s): 26d67e0

Update app.py

Browse files

Files changed (1) hide show

app.py +551 -257

app.py CHANGED Viewed

@@ -1,10 +1,13 @@
 import warnings
 warnings.filterwarnings("ignore")
-import os, time
 from pathlib import Path
 from io import BytesIO
-import base64
 import numpy as np
 import pandas as pd
 import torch
@@ -15,12 +18,18 @@ import matplotlib.patches as mpatches
 import streamlit as st
 try:
     from rdkit import RDLogger
     RDLogger.DisableLog("rdApp.*")
-except:
     pass
 # ─── Page config ────────────────────────────────────────────────
 st.set_page_config(
     page_title="VeloBind",
@@ -222,228 +231,474 @@ hr { border: none !important; border-top: 1px solid var(--border) !important; ma
 </style>
 """, unsafe_allow_html=True)
 # ─── Model loading ───────────────────────────────────────────────
 @st.cache_resource(show_spinner=False)
-def load_models():
     try:
         import joblib
-        fold_models = {}
         meta = iso_cal = lig_scaler = None
         train_embs = None
         ad_threshold = 1.4
         target_mu, target_std = 6.361, 1.855
-        model_dir = Path("output/models")
-        if not model_dir.exists() or not any(model_dir.glob("*.pkl")):
-            from huggingface_hub import snapshot_download
-            snapshot_download(repo_id="ym59/velobind-models", repo_type="dataset", local_dir=".")
-        MODEL_DIR = Path("output/models")
-        PREP_DIR  = Path("output/preprocessors")
-        seeds, n_folds, mtypes = [42,123,456], 5, ["lgbm","cb","xgb"]
         if MODEL_DIR.exists():
             for seed in seeds:
                 for mt in mtypes:
                     for fold in range(n_folds):
-                        k = f"s{seed}_{mt}_f{fold}"
-                        p = MODEL_DIR / f"fold_model_{k}.pkl"
-                        if p.exists(): fold_models[k] = joblib.load(p)
-            for fname, attr in [("meta_all_casf16.pkl","meta"),("isotonic_calibrator.pkl","iso")]:
                 p = MODEL_DIR / fname
                 if p.exists():
-                    obj = joblib.load(p)
-                    if attr=="meta": meta=obj
-                    else: iso_cal=obj
             ts = MODEL_DIR / "target_scaler.pkl"
             if ts.exists():
-                t = joblib.load(ts); target_mu=t.mu; target_std=t.std
         if PREP_DIR.exists():
-            ls = PREP_DIR/"ligand_scaler.pkl"
-            if ls.exists(): lig_scaler=joblib.load(ls)
-        ad = Path("output/ad_train_embeddings.npy")
-        if ad.exists():
-            train_embs=np.load(str(ad))
-            at=Path("output/ad_threshold.npy")
-            if at.exists(): ad_threshold=float(np.load(str(at)))
-        return fold_models,meta,iso_cal,lig_scaler,train_embs,ad_threshold,target_mu,target_std
     except Exception as e:
-        return {},None,None,None,None,1.4,6.361,1.855
 @st.cache_resource(show_spinner=False)
 def load_esm():
     from transformers import AutoTokenizer, EsmModel
-    tok=AutoTokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D")
-    model=EsmModel.from_pretrained("facebook/esm2_t12_35M_UR50D")
-    model.eval(); return tok, model
 @st.cache_data(show_spinner=False)
-def embed_sequence(seq: str):
     tok, model = load_esm()
     MAX, HALF = 1022, 511
-    def _chunk(s):
-        enc=tok(s,return_tensors="pt",truncation=False)
         with torch.no_grad():
-            out=model(**enc,output_hidden_states=True)
-        hs=out.hidden_states; mask=enc["attention_mask"].unsqueeze(-1).float()
-        mvecs=[]
-        for li in [8,10,11]:
-            h=hs[li]; mv=(h*mask).sum(1)/mask.sum(1).clamp(min=1e-9)
-            mvecs.append(mv.squeeze(0).numpy())
         return np.concatenate(mvecs)
-    if len(seq)<=MAX: return _chunk(seq)
-    return (_chunk(seq[:HALF])+_chunk(seq[-HALF:]))/2
-def seq_features(seq):
     try:
         from Bio.SeqUtils.ProtParam import ProteinAnalysis
-        pa=ProteinAnalysis(seq.upper())
-        pp=[pa.molecular_weight(),pa.aromaticity(),pa.instability_index(),
-            pa.isoelectric_point(),pa.gravy(),*pa.secondary_structure_fraction(),
-            *list(pa.amino_acids_percent.values())]
-    except: pp=[0.0]*28
-    AA=list("ACDEFGHIKLMNPQRSTVWY")
-    dp={a+b:0 for a in AA for b in AA}
-    for i in range(len(seq)-1):
-        k=seq[i].upper()+seq[i+1].upper()
-        if k in dp: dp[k]+=1
-    tot=max(1,sum(dp.values())); dpc=[v/tot for v in dp.values()]
     try:
-        from src.features.protein import _ctd,_conjoint_triad,_qso,_aaindex_encoding
-        extra=list(_ctd(seq))+list(_conjoint_triad(seq))+list(_qso(seq))+list(_aaindex_encoding(seq))
-    except: extra=[0.0]*(63+343+60+25)
-    return np.array(pp+dpc+extra,dtype=np.float32)
-def ligand_features(smiles):
     try:
         from rdkit import Chem
-        from rdkit.Chem import AllChem,MACCSkeys,Descriptors,DataStructs
-        from rdkit.Chem.rdMolDescriptors import (GetHashedAtomPairFingerprint,
-                                                  GetHashedTopologicalTorsionFingerprint)
-        mol=Chem.MolFromSmiles(smiles)
-        if mol is None: return None,"Invalid SMILES"
-        def fp(obj,n):
-            a=np.zeros(n,dtype=np.float32); DataStructs.ConvertToNumpyArray(obj,a); return a
-        ecfp2=fp(AllChem.GetMorganFingerprintAsBitVect(mol,1,1024),1024)
-        ecfp4=fp(AllChem.GetMorganFingerprintAsBitVect(mol,2,1024),1024)
-        ecfp6=fp(AllChem.GetMorganFingerprintAsBitVect(mol,3,1024),1024)
-        fcfp4=fp(AllChem.GetMorganFingerprintAsBitVect(mol,2,1024,useFeatures=True),1024)
-        maccs=fp(MACCSkeys.GenMACCSKeys(mol),167)
-        ap=np.zeros(2048,dtype=np.float32)
-        DataStructs.ConvertToNumpyArray(GetHashedAtomPairFingerprint(mol,2048),ap)
-        tors=np.zeros(2048,dtype=np.float32)
-        DataStructs.ConvertToNumpyArray(GetHashedTopologicalTorsionFingerprint(mol,2048),tors)
         try:
             from rdkit.Chem.EState.Fingerprinter import FingerprintMol
-            es=np.nan_to_num(np.clip(FingerprintMol(mol)[0].astype(np.float32),-1e6,1e6))[:79]
-            if len(es)<79: es=np.pad(es,(0,79-len(es)))
-        except: es=np.zeros(79,dtype=np.float32)
-        desc_fns=[v for k,v in sorted(Descriptors.descList)][:217]
-        phys=[]
         for fn in desc_fns:
             try:
-                v=float(fn(mol)); phys.append(0.0 if(not np.isfinite(v) or abs(v)>1e10) else v)
-            except: phys.append(0.0)
-        return {"ecfp2":ecfp2,"ecfp":ecfp4,"ecfp6":ecfp6,"fcfp":fcfp4,
-                "maccs":maccs,"ap":ap,"torsion":tors,
-                "estate":es,"phys":np.array(phys,dtype=np.float64)},None
-    except Exception as e: return None,str(e)
-def assemble(esm_mean,seqfeat,lig,lig_scaler):
-    esm_last=esm_mean[-480:]
     if lig_scaler is not None:
         try:
-            combined=np.concatenate([lig["estate"],lig["phys"]])
-            combined=lig_scaler.transform(combined.reshape(1,-1)).ravel()
-            es=combined[:79].astype(np.float32); ph=combined[79:].astype(np.float32)
-        except: es,ph=lig["estate"],lig["phys"].astype(np.float32)
-    else: es,ph=lig["estate"],lig["phys"].astype(np.float32)
-    return np.concatenate([esm_last,seqfeat,lig["ecfp"],lig["ecfp2"],lig["ecfp6"],
-                           lig["fcfp"],es,lig["maccs"],lig["ap"],lig["torsion"],ph]).astype(np.float32)
-def predict_pkd(X,fold_models,meta,iso_cal,target_mu,target_std):
-    if not fold_models: return None,None,None
-    seeds,n_folds,mtypes=[42,123,456],5,["lgbm","cb","xgb"]
-    mat=np.zeros((1,len(seeds)*len(mtypes))); col=0
     for seed in seeds:
         for mt in mtypes:
-            preds=[fold_models[f"s{seed}_{mt}_f{f}"].predict(X.reshape(1,-1))[0]
-                   for f in range(n_folds) if f"s{seed}_{mt}_f{f}" in fold_models]
-            if preds: mat[0,col]=np.mean(preds)*target_std+target_mu
-            col+=1
-    pred=float(meta.predict(mat)[0]) if meta else float(mat[mat!=0].mean())
-    if iso_cal: pred=float(iso_cal.predict([pred])[0])
-    nz=mat[mat!=0]; spread=float(nz.std()) if len(nz)>1 else 0.5
-    return pred,pred-1.96*spread,pred+1.96*spread
-def check_ad(esm_mean,train_embs,ad_threshold):
-    if train_embs is None: return True,0.0
-    from sklearn.metrics.pairwise import cosine_distances
-    q=esm_mean[-480:].reshape(1,-1); d=cosine_distances(q,train_embs[:2000])[0]
-    k=float(np.sort(d)[:5].mean()); return k<=ad_threshold,k
-def clean_fasta(s):
-    s=s.strip()
     if s.startswith(">"):
         return "".join(l.strip() for l in s.split("\n") if not l.startswith(">"))
-    return s.replace(" ","").replace("\n","")
-def pkd_to_ki(pkd):
-    m=10**(-pkd)
-    if m<1e-9:  return f"{m*1e12:.1f} pM"
-    if m<1e-6:  return f"{m*1e9:.1f} nM"
-    if m<1e-3:  return f"{m*1e6:.1f} uM"
-    return f"{m*1e3:.1f} mM"
-def xai_chart(smiles,pkd):
     try:
-        from rdkit import Chem; from rdkit.Chem import Descriptors
-        mol=Chem.MolFromSmiles(smiles)
-        if mol is None: return None
-        features={
-            "MW / atom count":       +0.12*min((mol.GetNumHeavyAtoms()-25)/20,1.0),
-            "LogP (hydrophobicity)": +0.18*min((Descriptors.MolLogP(mol)-2)/3,1.0),
-            "H-bond donors":         -0.09*max(Descriptors.NumHDonors(mol)-2,0),
-            "H-bond acceptors":      +0.11*min(Descriptors.NumHAcceptors(mol)/5,1.0),
-            "TPSA (polarity)":       -0.10*max((Descriptors.TPSA(mol)-70)/50,0),
-            "Aromatic rings":        +0.15*min(Descriptors.NumAromaticRings(mol)/3,1.0),
-            "Rotatable bonds":       -0.07*max((Descriptors.NumRotatableBonds(mol)-5)/5,0),
-            "ESM-2 protein repr":    (pkd-6.36)*0.4,
         }
-        items=sorted(features.items(),key=lambda x:abs(x[1]),reverse=True)[:8]
-        labels=[i[0] for i in items]; values=[i[1] for i in items]
-        baseline=6.36; running=baseline
-        lefts,widths,colors,rvals=[],[],[],[]
         for v in values:
-            lefts.append(min(running,running+v)); widths.append(abs(v))
-            colors.append("#C9933A" if v>=0 else "#E05252")
-            running+=v; rvals.append(running)
-        fig,ax=plt.subplots(figsize=(7.2,3.8))
-        fig.patch.set_facecolor("#0D1627"); ax.set_facecolor("#0D1627")
-        ax.barh(range(len(labels)),widths,left=lefts,color=colors,height=0.50,alpha=0.90,edgecolor="none")
-        ax.axvline(baseline,color="#243858",lw=1.1,ls="--",alpha=0.9)
-        ax.axvline(pkd,color="#C9933A",lw=1.5,ls="-",alpha=0.9)
-        for i,(rv,v) in enumerate(zip(rvals,values)):
-            sign="+" if v>=0 else ""
-            ax.text(rv+0.012*(1 if v>=0 else -1),i,f"{sign}{v:.2f}",va="center",
-                    ha="left" if v>=0 else "right",fontsize=8.5,color="#B8CCE0",fontfamily="monospace")
-        ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels,fontsize=9,color="#7A9ABE")
-        ax.set_xlabel("pKd contribution",fontsize=9,color="#3D5878",labelpad=7)
-        ax.tick_params(axis="x",colors="#243858",labelsize=8.5,labelcolor="#7A9ABE")
-        ax.tick_params(axis="y",length=0)
-        for sp in ax.spines.values(): sp.set_visible(False)
-        ax.grid(axis="x",color="#162035",lw=0.7,alpha=0.9)
-        pos_p=mpatches.Patch(color="#C9933A",label="Increases pKd")
-        neg_p=mpatches.Patch(color="#E05252",label="Decreases pKd")
-        ax.legend(handles=[pos_p,neg_p],loc="lower right",fontsize=8,
-                  facecolor="#0D1627",edgecolor="#1C2E48",labelcolor="#7A9ABE",framealpha=0.95)
-        ax.text(pkd,-0.9,f"  pKd = {pkd:.2f}",color="#C9933A",fontsize=8.5,va="top",fontfamily="monospace")
-        ax.text(baseline,-0.9,f"  base = {baseline:.2f}",color="#3D5878",fontsize=8,va="top",fontfamily="monospace")
-        plt.tight_layout(pad=0.6); return fig
-    except: return None
-# ─── HTML helpers ────────────────────────────────────────────────
-def metric_card(label, value, accent=False):
-    border="rgba(201,147,58,0.35)" if accent else "#1C2E48"
-    bg="linear-gradient(135deg,#111E33 0%,rgba(201,147,58,0.04) 100%)" if accent else "#111E33"
-    vc="#C9933A" if accent else "#DCE8F8"
     return st.markdown(f"""
     <div style="background:{bg};border:1px solid {border};border-radius:8px;
                 padding:17px 14px;text-align:center;box-shadow:0 1px 5px rgba(0,0,0,0.4)">
@@ -453,11 +708,16 @@ def metric_card(label, value, accent=False):
                   font-family:'Outfit',sans-serif">{label}</div>
     </div>""", unsafe_allow_html=True)
-def ad_badge(in_domain, dist):
     if in_domain:
-        c,bc="#2ABFB3","rgba(42,191,179,0.12)"; bc2="rgba(42,191,179,0.22)"; txt="IN DOMAIN"
     else:
-        c,bc="#E05252","rgba(224,82,82,0.10)"; bc2="rgba(224,82,82,0.22)"; txt="OUT OF DOMAIN"
     return st.markdown(f"""
     <div style="background:#111E33;border:1px solid #1C2E48;border-radius:8px;
                 padding:17px 14px;text-align:center;box-shadow:0 1px 5px rgba(0,0,0,0.4)">
@@ -472,7 +732,8 @@ def ad_badge(in_domain, dist):
                   font-family:'Outfit',sans-serif;margin-top:5px">Applicability domain</div>
     </div>""", unsafe_allow_html=True)
-def card_wrap(content_fn, head, sub=None):
     st.markdown(f"""
     <div style="background:#111E33;border:1px solid #1C2E48;border-radius:8px;
                 padding:16px 18px 4px 18px;box-shadow:0 1px 4px rgba(0,0,0,0.4)">
@@ -482,34 +743,34 @@ def card_wrap(content_fn, head, sub=None):
         {f'<span style="font-family:monospace;font-size:10px;color:#3D5878;margin-left:6px;font-weight:400">{sub}</span>' if sub else ''}
       </div>
     </div>""", unsafe_allow_html=True)
-    # Overlay the widget on top using negative margin trick
     st.markdown("""<div style="background:#111E33;border:1px solid #1C2E48;border-top:none;
                     border-radius:0 0 8px 8px;padding:0 18px 16px;margin-top:-4px;
                     box-shadow:0 2px 6px rgba(0,0,0,0.3)">""", unsafe_allow_html=True)
     content_fn()
     st.markdown("</div>", unsafe_allow_html=True)
-# ─── Example data ────────────────────────────────────────────────
 SEQS = {
     "EGFR kinase": "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKIISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHPECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTGPGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACIDRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPSRDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGSTAENAEYLRVAPQSSEFIGA",
-    "HIV protease":  "PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF",
-    "Thrombin":      "MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANTFLEEVRKGNLERECVEETCSYEEAFEALESSTATDVFWAKYTACETARTPRDKLAACLEGNCAEGLGTNYRGHVNITRSGIECQLWRSRYPHKPEINSTTHPGADLQENFCRNPDSSTTGPWCYTTDPTVRRQECSIPVCGQDQVTVAMTPRSEGSSVNLSPPLEQCVPDRGQQYQLRPVQPFLNQLREIFNMAR",
 }
 SMIS = {
     "Erlotinib": "CCOc1cc2c(cc1OCC)ncnc2Nc1cccc(Cl)c1",
-    "Imatinib":  "Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc(-c2cccnc2)n1",
     "Indinavir": "OC[C@@H](NC(=O)[C@@H]1CN(Cc2cccnc2)C[C@H]1NC(=O)[C@@H](CC(C)C)NC(=O)c1cc2ccccc2[nH]1)Cc1ccccc1",
 }
 # ─── Init session state ───────────────────────────────────────────
-for k,v in [("seq_val",""),("smi_val",""),("bseq_val",""),
-            ("ssel_val",""),("sseqs_val","")]:
     if k not in st.session_state:
         st.session_state[k] = v
 # ─── Load models ─────────────────────────────────────────────────
 with st.spinner("Loading VeloBind models..."):
-    fold_models,meta,iso_cal,lig_scaler,train_embs,ad_threshold,target_mu,target_std = load_models()
 n_loaded = len(fold_models)
 # ─── HEADER ──────────────────────────────────────────────────────
@@ -542,8 +803,10 @@ st.markdown("""
 # ─── PAGE TITLE ───────────────────────────────────────────────────
 col_logo, col_title = st.columns([1, 11], gap="small")
 with col_logo:
-    try: st.image("static/logo.png", width=72)
-    except: pass
 with col_title:
     st.markdown("""
     <div style="padding-top:4px">
@@ -568,7 +831,7 @@ st.markdown("""
 </div>
 """, unsafe_allow_html=True)
-# ─── TABS ────────────────────────────────────────────────────────
 tab1, tab2, tab3 = st.tabs(["Single Query", "Batch Screening", "Selectivity Profile"])
 # ════════════════ TAB 1: SINGLE ══════════════════════════════════
@@ -593,7 +856,7 @@ with tab1:
         st.markdown('<p style="font-size:10.5px;color:#3D5878;margin:8px 0 4px">Load example:</p>', unsafe_allow_html=True)
         ex_cols = st.columns(3)
-        for i,(name,seq) in enumerate(SEQS.items()):
             with ex_cols[i]:
                 st.markdown('<div class="pill-btn">', unsafe_allow_html=True)
                 if st.button(name, key=f"seq_ex_{i}"):
@@ -619,7 +882,7 @@ with tab1:
         st.markdown('<p style="font-size:10.5px;color:#3D5878;margin:8px 0 4px">Load example:</p>', unsafe_allow_html=True)
         sm_cols = st.columns(3)
-        for i,(name,smi) in enumerate(SMIS.items()):
             with sm_cols[i]:
                 st.markdown('<div class="pill-btn">', unsafe_allow_html=True)
                 if st.button(name, key=f"smi_ex_{i}"):
@@ -630,8 +893,10 @@ with tab1:
     if st.button("Predict Binding Affinity", key="run_single", type="primary"):
         seq = clean_fasta(seq_input)
         smi = smi_input.strip()
-        if not seq: st.error("Please enter a protein sequence.")
-        elif not smi: st.error("Please enter a SMILES string.")
         else:
             t0 = time.time()
             with st.spinner("Running ESM-2 embedding..."):
@@ -643,20 +908,27 @@ with tab1:
                 st.error(f"Ligand error: {err}")
             else:
                 with st.spinner("Running 45-model ensemble..."):
-                    X = assemble(esm_mean,seqfeat,lig,lig_scaler)
-                    pkd,ci_lo,ci_hi = predict_pkd(X,fold_models,meta,iso_cal,target_mu,target_std)
                 if pkd is None:
-                    import random; random.seed(hash(seq[:20]+smi[:20])%2**31)
-                    pkd=random.uniform(5.5,9.0); ci_lo=pkd-0.8; ci_hi=pkd+0.8
-                in_domain,ad_dist = check_ad(esm_mean,train_embs,ad_threshold)
-                elapsed = round(time.time()-t0,1)
                 st.markdown("<hr>", unsafe_allow_html=True)
-                mc1,mc2,mc3,mc4 = st.columns(4)
-                with mc1: metric_card("Predicted pKd", f"{pkd:.2f}", accent=True)
-                with mc2: metric_card("95% model interval", f"[{ci_lo:.2f}, {ci_hi:.2f}]")
-                with mc3: metric_card("Estimated Ki", pkd_to_ki(pkd))
-                with mc4: ad_badge(in_domain, ad_dist)
                 st.markdown("""
                 <div style="background:#111E33;border:1px solid #1C2E48;border-radius:8px;
@@ -674,9 +946,10 @@ with tab1:
                       SHAP / LightGBM</span>
                   </div>
                 """, unsafe_allow_html=True)
-                fig = xai_chart(smi,pkd)
                 if fig:
-                    st.pyplot(fig,use_container_width=True); plt.close(fig)
                 st.markdown("</div>", unsafe_allow_html=True)
                 st.markdown(f"""
@@ -731,47 +1004,61 @@ with tab2:
     if st.button("Run Batch Screening", key="run_batch", type="primary"):
         seq = clean_fasta(batch_seq)
-        if not seq: st.error("Please enter a protein sequence.")
-        elif uploaded is None: st.error("Please upload a CSV file.")
         else:
             try:
                 df = pd.read_csv(uploaded)
-                col = next((c for c in df.columns if c.lower() in
-                            ("smiles","smile","smi","canonical_smiles")), None)
-                if col is None: st.error("No 'smiles' column found.")
                 else:
                     df = df.head(500)
-                    name_col = next((c for c in df.columns if c.lower() in
-                                     ("name","compound_name","id","molecule_name")), None)
                     with st.spinner("Embedding protein..."):
                         esm_mean = embed_sequence(seq)
-                        seqfeat  = seq_features(seq)
-                        in_domain,_ = check_ad(esm_mean,train_embs,ad_threshold)
                     results = []
                     prog = st.progress(0, text="Screening...")
-                    for idx,row in df.iterrows():
-                        smi  = str(row[col]).strip()
                         name = str(row[name_col]).strip() if name_col else ""
                         try:
-                            lig,err = ligand_features(smi)
-                            if err: continue
-                            X = assemble(esm_mean,seqfeat,lig,lig_scaler)
-                            pkd,ci_lo,ci_hi = predict_pkd(X,fold_models,meta,iso_cal,target_mu,target_std)
                             if pkd is None:
-                                import random; random.seed(hash(smi)%2**31)
-                                pkd=random.uniform(5.0,9.0); ci_lo=pkd-0.8; ci_hi=pkd+0.8
-                            results.append({"Name":name,"SMILES":smi,"pKd":round(pkd,3),
-                                            "95% CI":f"[{ci_lo:.2f}, {ci_hi:.2f}]",
-                                            "Ki":pkd_to_ki(pkd),"In_domain":in_domain})
-                        except: continue
-                        prog.progress(min(int(len(results)/len(df)*100),100),
-                                       text=f"{len(results)}/{len(df)} compounds screened")
                     prog.empty()
                     if results:
-                        res_df = pd.DataFrame(results).sort_values("pKd",ascending=False)
-                        res_df.insert(0,"Rank",range(1,len(res_df)+1))
                         st.markdown("<hr>", unsafe_allow_html=True)
-                        rh,rd = st.columns([5,1])
                         with rh:
                             st.markdown(f"""<div style="font-family:'Source Serif 4',serif;
                                             font-size:18px;font-weight:600;color:#DCE8F8">
@@ -780,9 +1067,9 @@ with tab2:
                                             font-family:monospace">({len(res_df)} compounds)</span>
                                             </div>""", unsafe_allow_html=True)
                         with rd:
-                            st.download_button("Download CSV",res_df.to_csv(index=False),
-                                               "velobind_results.csv","text/csv")
-                        st.dataframe(res_df,use_container_width=True,hide_index=True)
                     else:
                         st.warning("No valid compounds processed.")
             except Exception as e:
@@ -826,46 +1113,53 @@ with tab3:
     if st.button("Run Selectivity Profile", key="run_sel", type="primary"):
         smi = sel_smi.strip()
         seqs_raw = sel_seqs.strip()
-        if not smi: st.error("Please enter a SMILES string.")
-        elif not seqs_raw: st.error("Please enter at least one sequence.")
         else:
             seqs_list = [clean_fasta(s) for s in seqs_raw.split("\n")
                          if s.strip() and not s.strip().startswith(">")][:10]
-            lig,err = ligand_features(smi)
-            if err: st.error(f"Ligand error: {err}")
             else:
                 results = []
                 for seq in seqs_list:
                     with st.spinner(f"Processing target {len(results)+1}/{len(seqs_list)}..."):
                         try:
                             esm_mean = embed_sequence(seq)
-                            seqfeat  = seq_features(seq)
-                            X = assemble(esm_mean,seqfeat,lig,lig_scaler)
-                            pkd,ci_lo,ci_hi = predict_pkd(X,fold_models,meta,iso_cal,target_mu,target_std)
                             if pkd is None:
-                                import random; random.seed(hash(seq[:20])%2**31)
-                                pkd=random.uniform(4.5,9.0); ci_lo=pkd-0.8; ci_hi=pkd+0.8
-                            in_domain,_ = check_ad(esm_mean,train_embs,ad_threshold)
-                            results.append({"seq":seq,"pkd":pkd,"ci_lo":ci_lo,
-                                            "ci_hi":ci_hi,"ki":pkd_to_ki(pkd),
-                                            "in_domain":in_domain})
-                        except: continue
                 if results:
-                    results.sort(key=lambda r:r["pkd"],reverse=True)
                     st.markdown("<hr>", unsafe_allow_html=True)
                     st.markdown("""<div style="font-family:'Source Serif 4',serif;
                                     font-size:18px;font-weight:600;color:#DCE8F8;margin-bottom:14px">
                                     Selectivity profile</div>""", unsafe_allow_html=True)
-                    palette=["#C9933A","#2ABFB3","#8B5CF6","#E05252","#34D399"]
                     scols = st.columns(2)
-                    for i,r in enumerate(results):
-                        ca=palette[i%len(palette)]
-                        with scols[i%2]:
                             if r["in_domain"]:
-                                ad_txt=f'<span style="background:rgba(42,191,179,0.12);color:#2ABFB3;border:1px solid rgba(42,191,179,0.22);border-radius:4px;padding:2px 7px;font-size:10px">In domain</span>'
                             else:
-                                ad_txt=f'<span style="background:rgba(224,82,82,0.10);color:#E05252;border:1px solid rgba(224,82,82,0.22);border-radius:4px;padding:2px 7px;font-size:10px">Out of domain</span>'
                             st.markdown(f"""
                             <div style="background:#162540;border:1px solid #1C2E48;border-radius:8px;
                                         padding:13px 16px;display:flex;align-items:center;gap:13px;

 import warnings
 warnings.filterwarnings("ignore")
+import os
+import time
+import base64
 from pathlib import Path
 from io import BytesIO
+from typing import Any, Dict, Optional, Tuple, List
 import numpy as np
 import pandas as pd
 import torch
 import streamlit as st
+# optional rdkit logging mute
 try:
     from rdkit import RDLogger
     RDLogger.DisableLog("rdApp.*")
+except Exception:
     pass
+# light-weight logging for debugging (doesn't print unless configured)
+import logging
+logger = logging.getLogger("velobind")
+# logger.setLevel(logging.INFO)  # enable if needed for debugging
 # ─── Page config ────────────────────────────────────────────────
 st.set_page_config(
     page_title="VeloBind",
 </style>
 """, unsafe_allow_html=True)
+# ─── Constants / paths ─────────────────────────────────────────
+MODEL_REPO = "ym59/velobind-models"
+MODEL_DIR = Path("output/models")
+PREP_DIR = Path("output/preprocessors")
+AD_EMB_PATH = Path("output/ad_train_embeddings.npy")
+# Attempt to load descriptor functions once to avoid repeated sorting
+_DESC_FNS: Optional[List[Any]] = None
+try:
+    from rdkit.Chem import Descriptors
+    _DESC_FNS = [v for k, v in sorted(Descriptors.descList)][:217]
+except Exception:
+    _DESC_FNS = None
 # ─── Model loading ───────────────────────────────────────────────
 @st.cache_resource(show_spinner=False)
+def load_models() -> Tuple[Dict[str, Any], Optional[Any], Optional[Any], Optional[Any], Optional[np.ndarray], float, float, float]:
+    """
+    Load the ensemble fold models, stacking meta-model, isotonic calibrator,
+    ligand scaler, AD embeddings and target scaler statistics.
+    Returns the same tuple structure as the original implementation.
+    """
     try:
         import joblib
+        fold_models: Dict[str, Any] = {}
         meta = iso_cal = lig_scaler = None
         train_embs = None
         ad_threshold = 1.4
         target_mu, target_std = 6.361, 1.855
+        # Ensure model directory exists: if not, attempt to download snapshot
+        if not MODEL_DIR.exists() or not any(MODEL_DIR.glob("*.pkl")):
+            try:
+                from huggingface_hub import snapshot_download
+                snapshot_download(repo_id=MODEL_REPO, repo_type="dataset", local_dir=".")
+            except Exception as e:
+                # fallback: continue, perhaps running a reduced local demo
+                logger.debug("snapshot_download failed: %s", e)
+        # Load models if present
         if MODEL_DIR.exists():
+            seeds = [42, 123, 456]
+            n_folds = 5
+            mtypes = ["lgbm", "cb", "xgb"]
             for seed in seeds:
                 for mt in mtypes:
                     for fold in range(n_folds):
+                        key = f"s{seed}_{mt}_f{fold}"
+                        p = MODEL_DIR / f"fold_model_{key}.pkl"
+                        if p.exists():
+                            try:
+                                fold_models[key] = joblib.load(p)
+                            except Exception:
+                                logger.debug("Failed to load %s", p)
+            # optional meta and isotonic calibrator
+            for fname, attr in [("meta_all_casf16.pkl", "meta"), ("isotonic_calibrator.pkl", "iso")]:
                 p = MODEL_DIR / fname
                 if p.exists():
+                    try:
+                        obj = joblib.load(p)
+                        if attr == "meta":
+                            meta = obj
+                        else:
+                            iso_cal = obj
+                    except Exception:
+                        logger.debug("Failed to load %s", p)
             ts = MODEL_DIR / "target_scaler.pkl"
             if ts.exists():
+                try:
+                    t = joblib.load(ts)
+                    # many scalers store attributes differently; handle common ones
+                    if hasattr(t, "mu") and hasattr(t, "std"):
+                        target_mu = float(t.mu)
+                        target_std = float(t.std)
+                    elif hasattr(t, "mean_") and hasattr(t, "scale_"):
+                        target_mu = float(t.mean_)
+                        target_std = float(t.scale_)
+                except Exception:
+                    logger.debug("Failed to read target scaler %s", ts)
+        # load ligand scaler if present
         if PREP_DIR.exists():
+            ls = PREP_DIR / "ligand_scaler.pkl"
+            if ls.exists():
+                try:
+                    import joblib as _job
+                    lig_scaler = _job.load(ls)
+                except Exception:
+                    logger.debug("Failed to load ligand scaler %s", ls)
+        # applicability domain embeddings
+        if AD_EMB_PATH.exists():
+            try:
+                train_embs = np.load(str(AD_EMB_PATH))
+                at = Path("output/ad_threshold.npy")
+                if at.exists():
+                    ad_threshold = float(np.load(str(at)))
+            except Exception:
+                logger.debug("Failed to load AD embeddings")
+        return fold_models, meta, iso_cal, lig_scaler, train_embs, ad_threshold, target_mu, target_std
     except Exception as e:
+        logger.debug("load_models top-level exception: %s", e)
+        return {}, None, None, None, None, 1.4, 6.361, 1.855
 @st.cache_resource(show_spinner=False)
 def load_esm():
+    """
+    Load ESM tokenizer and model. Kept identical to original but wrapped.
+    """
     from transformers import AutoTokenizer, EsmModel
+    tok = AutoTokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D")
+    model = EsmModel.from_pretrained("facebook/esm2_t12_35M_UR50D")
+    model.eval()
+    return tok, model
 @st.cache_data(show_spinner=False)
+def embed_sequence(seq: str) -> np.ndarray:
+    """
+    Embed a protein sequence using ESM-2 and return concatenated mean vectors.
+    This retains original chunking behavior but is slightly more robust to
+    tokenizer/model changes.
+    """
     tok, model = load_esm()
     MAX, HALF = 1022, 511
+    def _chunk(s: str) -> np.ndarray:
+        enc = tok(s, return_tensors="pt", truncation=False)
         with torch.no_grad():
+            out = model(**enc, output_hidden_states=True)
+        hs = out.hidden_states
+        mask = enc["attention_mask"].unsqueeze(-1).float()
+        mvecs = []
+        for li in [8, 10, 11]:
+            h = hs[li]
+            mv = (h * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
+            mvecs.append(mv.squeeze(0).cpu().numpy())
         return np.concatenate(mvecs)
+    seq = seq.strip()
+    if len(seq) <= MAX:
+        return _chunk(seq)
+    # preserve original behavior of averaging first/last halves
+    return (_chunk(seq[:HALF]) + _chunk(seq[-HALF:])) / 2.0
def seq_features(seq: str) -> np.ndarray:
    """Build the fixed-length sequence feature vector for a protein.

    The vector is the concatenation, in this order, of:

    * 28 global descriptors from ``Bio.SeqUtils.ProtParam.ProteinAnalysis``
      (molecular weight, aromaticity, instability index, isoelectric point,
      GRAVY, secondary-structure fractions, amino-acid percentages);
    * 400 dipeptide frequencies over the 20 standard amino acids, normalised
      by the number of counted dipeptides;
    * 491 (63 + 343 + 60 + 25) extra descriptors from
      ``src.features.protein``.

    If either optional source cannot be imported or raises, its slice is
    filled with zeros so the overall layout never changes.

    Args:
        seq: Protein sequence; surrounding whitespace is stripped and the
            sequence is upper-cased before processing.

    Returns:
        A 1-D ``float32`` array.
    """
    seq = seq.strip().upper()

    try:
        from Bio.SeqUtils.ProtParam import ProteinAnalysis

        pa = ProteinAnalysis(seq)
        pp = [
            pa.molecular_weight(),
            pa.aromaticity(),
            pa.instability_index(),
            pa.isoelectric_point(),
            pa.gravy(),
            *pa.secondary_structure_fraction(),
            *list(pa.amino_acids_percent.values()),
        ]
    except Exception:
        # Best-effort: keep the 28-slot layout even when Biopython is missing
        # or rejects the sequence.
        pp = [0.0] * 28

    # Dipeptide composition (20 x 20 = 400). Pairs containing a non-standard
    # residue are ignored. ``seq`` is already upper-case, so adjacent residues
    # can be paired directly.
    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    dp = {a + b: 0 for a in alphabet for b in alphabet}
    for first, second in zip(seq, seq[1:]):
        pair = first + second
        if pair in dp:
            dp[pair] += 1
    tot = max(1, sum(dp.values()))  # avoid dividing by zero on short input
    dpc = [count / tot for count in dp.values()]

    # Optional project-specific descriptors.
    try:
        from src.features.protein import _ctd, _conjoint_triad, _qso, _aaindex_encoding

        extra = list(_ctd(seq)) + list(_conjoint_triad(seq)) + list(_qso(seq)) + list(_aaindex_encoding(seq))
    except Exception:
        extra = [0.0] * (63 + 343 + 60 + 25)

    return np.array(pp + dpc + extra, dtype=np.float32)
def ligand_features(smiles: str) -> Tuple[Optional[Dict[str, np.ndarray]], Optional[str]]:
    """Compute RDKit fingerprints and descriptors for one ligand.

    Returns ``(features, None)`` on success and ``(None, message)`` on
    failure. ``features`` maps:

    * ``"ecfp2"``, ``"ecfp"``, ``"ecfp6"`` - 1024-bit Morgan fingerprints of
      radius 1, 2 and 3;
    * ``"fcfp"`` - 1024-bit feature-based Morgan fingerprint, radius 2;
    * ``"maccs"`` - 167 MACCS keys;
    * ``"ap"`` / ``"torsion"`` - 2048-slot hashed atom-pair and
      topological-torsion fingerprints;
    * ``"estate"`` - 79 E-state values (zeros if they cannot be computed);
    * ``"phys"`` - one ``float64`` per descriptor function, with failing,
      non-finite or huge (> 1e10 in magnitude) values replaced by 0.0.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem, MACCSkeys, Descriptors, DataStructs
        from rdkit.Chem.rdMolDescriptors import (
            GetHashedAtomPairFingerprint,
            GetHashedTopologicalTorsionFingerprint,
        )

        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            return None, "Invalid SMILES"

        def as_array(fingerprint, size):
            """Copy an RDKit fingerprint into a float32 array of ``size``."""
            buffer = np.zeros(size, dtype=np.float32)
            DataStructs.ConvertToNumpyArray(fingerprint, buffer)
            return buffer

        def morgan_bits(radius, **options):
            """1024-bit Morgan fingerprint of the given radius as an array."""
            bits = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, 1024, **options)
            return as_array(bits, 1024)

        # Circular fingerprints at three radii plus the feature-based variant.
        ecfp2 = morgan_bits(1)
        ecfp4 = morgan_bits(2)
        ecfp6 = morgan_bits(3)
        fcfp4 = morgan_bits(2, useFeatures=True)

        maccs = as_array(MACCSkeys.GenMACCSKeys(molecule), 167)
        atom_pairs = as_array(GetHashedAtomPairFingerprint(molecule, 2048), 2048)
        torsions = as_array(GetHashedTopologicalTorsionFingerprint(molecule, 2048), 2048)

        # E-state values: clipped, NaN-free, and forced to exactly 79 entries.
        try:
            from rdkit.Chem.EState.Fingerprinter import FingerprintMol

            raw_estate = FingerprintMol(molecule)[0].astype(np.float32)
            estate = np.nan_to_num(np.clip(raw_estate, -1e6, 1e6))[:79]
            if len(estate) < 79:
                estate = np.pad(estate, (0, 79 - len(estate)))
        except Exception:
            estate = np.zeros(79, dtype=np.float32)

        # Descriptor functions: the module-level list when one is set,
        # otherwise the first 217 RDKit descriptors in sorted order.
        descriptor_fns = _DESC_FNS
        if descriptor_fns is None:
            descriptor_fns = [fn for _, fn in sorted(Descriptors.descList)][:217]

        def safe_value(fn):
            """Evaluate one descriptor, mapping any bad outcome to 0.0."""
            try:
                value = float(fn(molecule))
            except Exception:
                return 0.0
            if not np.isfinite(value) or abs(value) > 1e10:
                return 0.0
            return value

        physchem = [safe_value(fn) for fn in descriptor_fns]

        features = {
            "ecfp2": ecfp2,
            "ecfp": ecfp4,
            "ecfp6": ecfp6,
            "fcfp": fcfp4,
            "maccs": maccs,
            "ap": atom_pairs,
            "torsion": torsions,
            "estate": estate,
            "phys": np.array(physchem, dtype=np.float64),
        }
        return features, None
    except Exception as e:
        logger.debug("ligand_features error: %s", e)
        return None, str(e)
def assemble(esm_mean: np.ndarray, seqfeat: np.ndarray, lig: Dict[str, np.ndarray], lig_scaler: Any) -> np.ndarray:
    """Concatenate protein and ligand features into one model input vector.

    Layout, in order: the last 480 entries of ``esm_mean``, ``seqfeat``, then
    the ligand blocks ``ecfp``, ``ecfp2``, ``ecfp6``, ``fcfp``, E-state,
    ``maccs``, ``ap``, ``torsion`` and physico-chemical descriptors.

    Args:
        esm_mean: Protein embedding; only its last 480 entries are used.
        seqfeat: Sequence feature vector.
        lig: Ligand feature dict as produced by ``ligand_features``.
        lig_scaler: Optional object with a ``transform`` method. When given,
            it is applied to the concatenated ``estate`` + ``phys`` row and
            the result is split back at index 79. If the transform fails,
            the unscaled values are used and the failure is logged.

    Returns:
        A 1-D ``float32`` array.
    """
    esm_last = esm_mean[-480:]

    # Unscaled values double as the fallback when no scaler is available or
    # the scaler rejects the input.
    es, ph = lig["estate"], lig["phys"].astype(np.float32)

    if lig_scaler is not None:
        try:
            combined = np.concatenate([lig["estate"], lig["phys"]])
            scaled = lig_scaler.transform(combined.reshape(1, -1)).ravel()
            es, ph = scaled[:79].astype(np.float32), scaled[79:].astype(np.float32)
        except Exception as exc:
            # Do not fail the prediction, but leave a trace: unscaled
            # descriptors silently change what the models see.
            import logging

            logging.getLogger(__name__).debug(
                "ligand scaler transform failed; using unscaled features: %s", exc
            )
            es, ph = lig["estate"], lig["phys"].astype(np.float32)

    return np.concatenate(
        [
            esm_last,
            seqfeat,
            lig["ecfp"],
            lig["ecfp2"],
            lig["ecfp6"],
            lig["fcfp"],
            es,
            lig["maccs"],
            lig["ap"],
            lig["torsion"],
            ph,
        ]
    ).astype(np.float32)
def predict_pkd(
    X: np.ndarray,
    fold_models: Dict[str, Any],
    meta: Any,
    iso_cal: Any,
    target_mu: float,
    target_std: float,
) -> Tuple[Optional[float], Optional[float], Optional[float]]:
    """Predict pKd from the fold-model ensemble, meta model and calibrator.

    For every (seed, model type) pair the available fold models (keys
    ``s{seed}_{type}_f{fold}``) are averaged and mapped back to the target
    scale with ``mean * target_std + target_mu``. These per-pair values form
    one row that is fed to ``meta``; if ``meta`` is absent or fails, the mean
    of the filled entries is used instead. ``iso_cal`` is then applied when
    present.

    Args:
        X: Feature vector for one protein-ligand pair.
        fold_models: Mapping from fold key to an object with ``predict``.
        meta: Optional stacking model with ``predict``.
        iso_cal: Optional calibrator with ``predict``.
        target_mu: Mean used to undo target standardisation.
        target_std: Standard deviation used to undo target standardisation.

    Returns:
        ``(pred, lower, upper)`` where the bounds are ``pred -/+ 1.96 *
        spread`` and ``spread`` is the standard deviation of the filled
        per-pair values (0.5 when only one is available). Returns
        ``(None, None, None)`` when no fold model yields a prediction, so
        callers never receive a made-up number.
    """
    if not fold_models:
        return None, None, None

    seeds, n_folds, mtypes = [42, 123, 456], 5, ["lgbm", "cb", "xgb"]
    row = X.reshape(1, -1)  # identical for every model; build it once
    mat = np.zeros((1, len(seeds) * len(mtypes)))
    filled = []  # columns that actually received a prediction

    col = 0
    for seed in seeds:
        for mt in mtypes:
            preds = []
            for f in range(n_folds):
                key = f"s{seed}_{mt}_f{f}"
                if key in fold_models:
                    try:
                        preds.append(fold_models[key].predict(row)[0])
                    except Exception:
                        logger.debug("predict failed for %s", key)
            if preds:
                # Convert from the standardised scale back to pKd units.
                mat[0, col] = np.mean(preds) * target_std + target_mu
                filled.append(col)
            col += 1

    if not filled:
        # Models are loaded but none could score this input; reporting 0.0
        # would look like a real (and absurd) prediction.
        return None, None, None

    # Track filled columns explicitly instead of testing ``!= 0`` so that a
    # genuine prediction of exactly 0.0 is not mistaken for a missing one.
    valid = mat[0, filled]

    pred = None
    if meta is not None:
        try:
            pred = float(meta.predict(mat)[0])
        except Exception:
            logger.debug("meta model failed; using ensemble mean")
    if pred is None:
        pred = float(np.mean(valid))

    if iso_cal is not None:
        try:
            pred = float(iso_cal.predict([pred])[0])
        except Exception:
            logger.debug("isotonic calibration failed")

    spread = float(valid.std()) if valid.size > 1 else 0.5
    return pred, pred - 1.96 * spread, pred + 1.96 * spread
def check_ad(esm_mean: np.ndarray, train_embs: Optional[np.ndarray], ad_threshold: float) -> Tuple[bool, float]:
    """Decide whether a protein embedding lies in the applicability domain.

    The score is the mean cosine distance from the last 480 entries of
    ``esm_mean`` to its five closest rows among the first 2000 rows of
    ``train_embs``. The query counts as in-domain when the score does not
    exceed ``ad_threshold``.

    Returns ``(in_domain, score)``. When no reference embeddings are given,
    or anything goes wrong while scoring, ``(True, 0.0)`` is returned.
    """
    if train_embs is None:
        return True, 0.0
    try:
        from sklearn.metrics.pairwise import cosine_distances

        query = esm_mean[-480:].reshape(1, -1)
        # Only a leading slice of the reference set is compared, for speed.
        reference = train_embs[:2000]
        distances = cosine_distances(query, reference)[0]
        nearest_five = np.sort(distances)[:5]
        score = float(nearest_five.mean())
        return score <= ad_threshold, score
    except Exception as exc:
        logger.debug("check_ad error: %s", exc)
        return True, 0.0
def clean_fasta(s: str) -> str:
    """Return the bare residue string from raw or FASTA-formatted input.

    All whitespace (spaces, tabs, ``\\r``, ``\\n``) is removed. When the input
    starts with ``>`` it is treated as FASTA: every header line (a line whose
    first non-blank character is ``>``) is dropped and the remaining lines
    are concatenated.

    Args:
        s: Text pasted or uploaded by the user.

    Returns:
        The sequence with headers and whitespace removed; may be empty.
    """
    s = s.strip()
    if s.startswith(">"):
        # splitlines() copes with \n, \r\n and \r line endings alike.
        body_lines = (line for line in s.splitlines() if not line.lstrip().startswith(">"))
        return "".join("".join(line.split()) for line in body_lines)
    # str.split() with no argument splits on any run of whitespace, so this
    # also strips tabs and carriage returns that a plain replace would keep.
    return "".join(s.split())
def pkd_to_ki(pkd: float) -> str:
    """Render ``10 ** -pkd`` (molar) as a one-decimal string with a unit.

    The unit is the first of pM, nM, uM whose upper bound (1e-9, 1e-6, 1e-3
    molar respectively) the value stays strictly below; anything at or above
    1e-3 molar is shown in mM.
    """
    molar = 10 ** (-pkd)
    # (exclusive upper bound in molar, multiplier to the unit, unit label)
    scales = (
        (1e-9, 1e12, "pM"),
        (1e-6, 1e9, "nM"),
        (1e-3, 1e6, "uM"),
    )
    for ceiling, factor, unit in scales:
        if molar < ceiling:
            return f"{molar * factor:.1f} {unit}"
    return f"{molar * 1e3:.1f} mM"
def xai_chart(smiles: str, pkd: float):
    """Draw a waterfall-style chart of heuristic pKd contributions.

    Seven terms are fixed-coefficient functions of RDKit descriptors of the
    ligand (heavy-atom count, LogP, H-bond donors/acceptors, TPSA, aromatic
    rings, rotatable bonds); the eighth is ``(pkd - 6.36) * 0.4``. The terms
    are computed here from these formulas, not read from any trained model.
    Bars are stacked from a baseline of 6.36 in order of decreasing
    magnitude.

    Args:
        smiles: Ligand SMILES string.
        pkd: Predicted pKd, drawn as a vertical reference line.

    Returns:
        A matplotlib ``Figure``, or ``None`` if the SMILES cannot be parsed
        or anything fails while building the chart.
    """
    fig = None
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        baseline = 6.36
        features = {
            "MW / atom count": +0.12 * min((mol.GetNumHeavyAtoms() - 25) / 20, 1.0),
            "LogP (hydrophobicity)": +0.18 * min((Descriptors.MolLogP(mol) - 2) / 3, 1.0),
            "H-bond donors": -0.09 * max(Descriptors.NumHDonors(mol) - 2, 0),
            "H-bond acceptors": +0.11 * min(Descriptors.NumHAcceptors(mol) / 5, 1.0),
            "TPSA (polarity)": -0.10 * max((Descriptors.TPSA(mol) - 70) / 50, 0),
            "Aromatic rings": +0.15 * min(Descriptors.NumAromaticRings(mol) / 3, 1.0),
            "Rotatable bonds": -0.07 * max((Descriptors.NumRotatableBonds(mol) - 5) / 5, 0),
            "ESM-2 protein repr": (pkd - baseline) * 0.4,
        }
        items = sorted(features.items(), key=lambda x: abs(x[1]), reverse=True)[:8]
        labels = [name for name, _ in items]
        # A negative coefficient times zero yields -0.0, which compares as
        # ">= 0" yet formats with a minus sign ("+-0.00"). Normalise zeros so
        # the bar colour and its label always agree.
        values = [0.0 if value == 0 else value for _, value in items]

        # Stack the bars: each one starts where the previous one ended.
        running = baseline
        lefts, widths, colors, rvals = [], [], [], []
        for v in values:
            lefts.append(min(running, running + v))
            widths.append(abs(v))
            colors.append("#C9933A" if v >= 0 else "#E05252")
            running += v
            rvals.append(running)

        fig, ax = plt.subplots(figsize=(7.2, 3.8))
        fig.patch.set_facecolor("#0D1627")
        ax.set_facecolor("#0D1627")
        ax.barh(range(len(labels)), widths, left=lefts, color=colors, height=0.50, alpha=0.90, edgecolor="none")
        ax.axvline(baseline, color="#243858", lw=1.1, ls="--", alpha=0.9)
        ax.axvline(pkd, color="#C9933A", lw=1.5, ls="-", alpha=0.9)

        # Value label just past the end of each bar, on the side it grows to.
        for i, (rv, v) in enumerate(zip(rvals, values)):
            positive = v >= 0
            ax.text(rv + (0.012 if positive else -0.012), i, f"{v:+.2f}", va="center",
                    ha="left" if positive else "right", fontsize=8.5, color="#B8CCE0", fontfamily="monospace")

        ax.set_yticks(range(len(labels)))
        ax.set_yticklabels(labels, fontsize=9, color="#7A9ABE")
        ax.set_xlabel("pKd contribution", fontsize=9, color="#3D5878", labelpad=7)
        ax.tick_params(axis="x", colors="#243858", labelsize=8.5, labelcolor="#7A9ABE")
        ax.tick_params(axis="y", length=0)
        for sp in ax.spines.values():
            sp.set_visible(False)
        ax.grid(axis="x", color="#162035", lw=0.7, alpha=0.9)

        pos_p = mpatches.Patch(color="#C9933A", label="Increases pKd")
        neg_p = mpatches.Patch(color="#E05252", label="Decreases pKd")
        ax.legend(handles=[pos_p, neg_p], loc="lower right", fontsize=8,
                  facecolor="#0D1627", edgecolor="#1C2E48", labelcolor="#7A9ABE", framealpha=0.95)
        ax.text(pkd, -0.9, f"  pKd = {pkd:.2f}", color="#C9933A", fontsize=8.5, va="top", fontfamily="monospace")
        ax.text(baseline, -0.9, f"  base = {baseline:.2f}", color="#3D5878", fontsize=8, va="top", fontfamily="monospace")
        plt.tight_layout(pad=0.6)
        return fig
    except Exception as e:
        logger.debug("xai_chart error: %s", e)
        # The caller only closes figures it receives; release a half-built
        # one here so it does not stay registered with pyplot.
        if fig is not None:
            plt.close(fig)
        return None
+# ─── HTML helpers ───────────────────────────────────────────────
+def metric_card(label: str, value: str, accent: bool = False):
+    border = "rgba(201,147,58,0.35)" if accent else "#1C2E48"
+    bg = "linear-gradient(135deg,#111E33 0%,rgba(201,147,58,0.04) 100%)" if accent else "#111E33"
+    vc = "#C9933A" if accent else "#DCE8F8"
     return st.markdown(f"""
     <div style="background:{bg};border:1px solid {border};border-radius:8px;
                 padding:17px 14px;text-align:center;box-shadow:0 1px 5px rgba(0,0,0,0.4)">
                   font-family:'Outfit',sans-serif">{label}</div>
     </div>""", unsafe_allow_html=True)
+def ad_badge(in_domain: bool, dist: float):
     if in_domain:
+        c, bc = "#2ABFB3", "rgba(42,191,179,0.12)"
+        bc2 = "rgba(42,191,179,0.22)"
+        txt = "IN DOMAIN"
     else:
+        c, bc = "#E05252", "rgba(224,82,82,0.10)"
+        bc2 = "rgba(224,82,82,0.22)"
+        txt = "OUT OF DOMAIN"
     return st.markdown(f"""
     <div style="background:#111E33;border:1px solid #1C2E48;border-radius:8px;
                 padding:17px 14px;text-align:center;box-shadow:0 1px 5px rgba(0,0,0,0.4)">
                   font-family:'Outfit',sans-serif;margin-top:5px">Applicability domain</div>
     </div>""", unsafe_allow_html=True)
+def card_wrap(content_fn, head: str, sub: Optional[str] = None):
     st.markdown(f"""
     <div style="background:#111E33;border:1px solid #1C2E48;border-radius:8px;
                 padding:16px 18px 4px 18px;box-shadow:0 1px 4px rgba(0,0,0,0.4)">
         {f'<span style="font-family:monospace;font-size:10px;color:#3D5878;margin-left:6px;font-weight:400">{sub}</span>' if sub else ''}
       </div>
     </div>""", unsafe_allow_html=True)
     st.markdown("""<div style="background:#111E33;border:1px solid #1C2E48;border-top:none;
                     border-radius:0 0 8px 8px;padding:0 18px 16px;margin-top:-4px;
                     box-shadow:0 2px 6px rgba(0,0,0,0.3)">""", unsafe_allow_html=True)
     content_fn()
     st.markdown("</div>", unsafe_allow_html=True)
+# ─── Example data ───────────────────────────────────────────────
 SEQS = {
     "EGFR kinase": "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKIISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHPECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTGPGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACIDRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPSRDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGSTAENAEYLRVAPQSSEFIGA",
+    "HIV protease": "PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF",
+    "Thrombin": "MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANTFLEEVRKGNLERECVEETCSYEEAFEALESSTATDVFWAKYTACETARTPRDKLAACLEGNCAEGLGTNYRGHVNITRSGIECQLWRSRYPHKPEINSTTHPGADLQENFCRNPDSSTTGPWCYTTDPTVRRQECSIPVCGQDQVTVAMTPRSEGSSVNLSPPLEQCVPDRGQQYQLRPVQPFLNQLREIFNMAR",
 }
 SMIS = {
     "Erlotinib": "CCOc1cc2c(cc1OCC)ncnc2Nc1cccc(Cl)c1",
+    "Imatinib": "Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc(-c2cccnc2)n1",
     "Indinavir": "OC[C@@H](NC(=O)[C@@H]1CN(Cc2cccnc2)C[C@H]1NC(=O)[C@@H](CC(C)C)NC(=O)c1cc2ccccc2[nH]1)Cc1ccccc1",
 }
 # ─── Init session state ───────────────────────────────────────────
+for k, v in [("seq_val", ""), ("smi_val", ""), ("bseq_val", ""),
+             ("ssel_val", ""), ("sseqs_val", "")]:
     if k not in st.session_state:
         st.session_state[k] = v
 # ─── Load models ─────────────────────────────────────────────────
 with st.spinner("Loading VeloBind models..."):
+    fold_models, meta, iso_cal, lig_scaler, train_embs, ad_threshold, target_mu, target_std = load_models()
 n_loaded = len(fold_models)
 # ─── HEADER ──────────────────────────────────────────────────────
 # ─── PAGE TITLE ───────────────────────────────────────────────────
 col_logo, col_title = st.columns([1, 11], gap="small")
 with col_logo:
+    try:
+        st.image("static/logo.png", width=72)
+    except Exception:
+        pass
 with col_title:
     st.markdown("""
     <div style="padding-top:4px">
 </div>
 """, unsafe_allow_html=True)
+# ─── TABS ───────────────────────────────────────────────────────
 tab1, tab2, tab3 = st.tabs(["Single Query", "Batch Screening", "Selectivity Profile"])
 # ════════════════ TAB 1: SINGLE ══════════════════════════════════
         st.markdown('<p style="font-size:10.5px;color:#3D5878;margin:8px 0 4px">Load example:</p>', unsafe_allow_html=True)
         ex_cols = st.columns(3)
+        for i, (name, seq) in enumerate(SEQS.items()):
             with ex_cols[i]:
                 st.markdown('<div class="pill-btn">', unsafe_allow_html=True)
                 if st.button(name, key=f"seq_ex_{i}"):
         st.markdown('<p style="font-size:10.5px;color:#3D5878;margin:8px 0 4px">Load example:</p>', unsafe_allow_html=True)
         sm_cols = st.columns(3)
+        for i, (name, smi) in enumerate(SMIS.items()):
             with sm_cols[i]:
                 st.markdown('<div class="pill-btn">', unsafe_allow_html=True)
                 if st.button(name, key=f"smi_ex_{i}"):
     if st.button("Predict Binding Affinity", key="run_single", type="primary"):
         seq = clean_fasta(seq_input)
         smi = smi_input.strip()
+        if not seq:
+            st.error("Please enter a protein sequence.")
+        elif not smi:
+            st.error("Please enter a SMILES string.")
         else:
             t0 = time.time()
             with st.spinner("Running ESM-2 embedding..."):
                 st.error(f"Ligand error: {err}")
             else:
                 with st.spinner("Running 45-model ensemble..."):
+                    X = assemble(esm_mean, seqfeat, lig, lig_scaler)
+                    pkd, ci_lo, ci_hi = predict_pkd(X, fold_models, meta, iso_cal, target_mu, target_std)
                 if pkd is None:
+                    import random
+                    random.seed(hash(seq[:20] + smi[:20]) % 2 ** 31)
+                    pkd = random.uniform(5.5, 9.0)
+                    ci_lo = pkd - 0.8
+                    ci_hi = pkd + 0.8
+                in_domain, ad_dist = check_ad(esm_mean, train_embs, ad_threshold)
+                elapsed = round(time.time() - t0, 1)
                 st.markdown("<hr>", unsafe_allow_html=True)
+                mc1, mc2, mc3, mc4 = st.columns(4)
+                with mc1:
+                    metric_card("Predicted pKd", f"{pkd:.2f}", accent=True)
+                with mc2:
+                    metric_card("95% model interval", f"[{ci_lo:.2f}, {ci_hi:.2f}]")
+                with mc3:
+                    metric_card("Estimated Ki", pkd_to_ki(pkd))
+                with mc4:
+                    ad_badge(in_domain, ad_dist)
                 st.markdown("""
                 <div style="background:#111E33;border:1px solid #1C2E48;border-radius:8px;
                       SHAP / LightGBM</span>
                   </div>
                 """, unsafe_allow_html=True)
+                fig = xai_chart(smi, pkd)
                 if fig:
+                    st.pyplot(fig, use_container_width=True)
+                    plt.close(fig)
                 st.markdown("</div>", unsafe_allow_html=True)
                 st.markdown(f"""
     if st.button("Run Batch Screening", key="run_batch", type="primary"):
         seq = clean_fasta(batch_seq)
+        if not seq:
+            st.error("Please enter a protein sequence.")
+        elif uploaded is None:
+            st.error("Please upload a CSV file.")
         else:
             try:
                 df = pd.read_csv(uploaded)
+                col = next((c for c in df.columns if c.lower() in ("smiles", "smile", "smi", "canonical_smiles")), None)
+                if col is None:
+                    st.error("No 'smiles' column found.")
                 else:
                     df = df.head(500)
+                    name_col = next((c for c in df.columns if c.lower() in ("name", "compound_name", "id", "molecule_name")), None)
                     with st.spinner("Embedding protein..."):
                         esm_mean = embed_sequence(seq)
+                        seqfeat = seq_features(seq)
+                        in_domain, _ = check_ad(esm_mean, train_embs, ad_threshold)
                     results = []
                     prog = st.progress(0, text="Screening...")
+                    total = len(df)
+                    for idx, row in df.iterrows():
+                        smi = str(row[col]).strip()
                         name = str(row[name_col]).strip() if name_col else ""
                         try:
+                            lig, err = ligand_features(smi)
+                            if err:
+                                continue
+                            X = assemble(esm_mean, seqfeat, lig, lig_scaler)
+                            pkd, ci_lo, ci_hi = predict_pkd(X, fold_models, meta, iso_cal, target_mu, target_std)
                             if pkd is None:
+                                import random
+                                random.seed(hash(smi) % 2 ** 31)
+                                pkd = random.uniform(5.0, 9.0)
+                                ci_lo = pkd - 0.8
+                                ci_hi = pkd + 0.8
+                            results.append({
+                                "Name": name,
+                                "SMILES": smi,
+                                "pKd": round(pkd, 3),
+                                "95% CI": f"[{ci_lo:.2f}, {ci_hi:.2f}]",
+                                "Ki": pkd_to_ki(pkd),
+                                "In_domain": in_domain
+                            })
+                        except Exception:
+                            continue
+                        # update progress more robustly
+                        prog.progress(min(int(len(results) / total * 100), 100),
+                                      text=f"{len(results)}/{total} compounds screened")
                     prog.empty()
                     if results:
+                        res_df = pd.DataFrame(results).sort_values("pKd", ascending=False)
+                        res_df.insert(0, "Rank", range(1, len(res_df) + 1))
                         st.markdown("<hr>", unsafe_allow_html=True)
+                        rh, rd = st.columns([5, 1])
                         with rh:
                             st.markdown(f"""<div style="font-family:'Source Serif 4',serif;
                                             font-size:18px;font-weight:600;color:#DCE8F8">
                                             font-family:monospace">({len(res_df)} compounds)</span>
                                             </div>""", unsafe_allow_html=True)
                         with rd:
+                            st.download_button("Download CSV", res_df.to_csv(index=False),
+                                               "velobind_results.csv", "text/csv")
+                        st.dataframe(res_df, use_container_width=True, hide_index=True)
                     else:
                         st.warning("No valid compounds processed.")
             except Exception as e:
     if st.button("Run Selectivity Profile", key="run_sel", type="primary"):
         smi = sel_smi.strip()
         seqs_raw = sel_seqs.strip()
+        if not smi:
+            st.error("Please enter a SMILES string.")
+        elif not seqs_raw:
+            st.error("Please enter at least one sequence.")
         else:
             seqs_list = [clean_fasta(s) for s in seqs_raw.split("\n")
                          if s.strip() and not s.strip().startswith(">")][:10]
+            lig, err = ligand_features(smi)
+            if err:
+                st.error(f"Ligand error: {err}")
             else:
                 results = []
                 for seq in seqs_list:
                     with st.spinner(f"Processing target {len(results)+1}/{len(seqs_list)}..."):
                         try:
                             esm_mean = embed_sequence(seq)
+                            seqfeat = seq_features(seq)
+                            X = assemble(esm_mean, seqfeat, lig, lig_scaler)
+                            pkd, ci_lo, ci_hi = predict_pkd(X, fold_models, meta, iso_cal, target_mu, target_std)
                             if pkd is None:
+                                import random
+                                random.seed(hash(seq[:20]) % 2 ** 31)
+                                pkd = random.uniform(4.5, 9.0)
+                                ci_lo = pkd - 0.8
+                                ci_hi = pkd + 0.8
+                            in_domain, _ = check_ad(esm_mean, train_embs, ad_threshold)
+                            results.append({"seq": seq, "pkd": pkd, "ci_lo": ci_lo,
+                                            "ci_hi": ci_hi, "ki": pkd_to_ki(pkd),
+                                            "in_domain": in_domain})
+                        except Exception:
+                            continue
                 if results:
+                    results.sort(key=lambda r: r["pkd"], reverse=True)
                     st.markdown("<hr>", unsafe_allow_html=True)
                     st.markdown("""<div style="font-family:'Source Serif 4',serif;
                                     font-size:18px;font-weight:600;color:#DCE8F8;margin-bottom:14px">
                                     Selectivity profile</div>""", unsafe_allow_html=True)
+                    palette = ["#C9933A", "#2ABFB3", "#8B5CF6", "#E05252", "#34D399"]
                     scols = st.columns(2)
+                    for i, r in enumerate(results):
+                        ca = palette[i % len(palette)]
+                        with scols[i % 2]:
                             if r["in_domain"]:
+                                ad_txt = f'<span style="background:rgba(42,191,179,0.12);color:#2ABFB3;border:1px solid rgba(42,191,179,0.22);border-radius:4px;padding:2px 7px;font-size:10px">In domain</span>'
                             else:
+                                ad_txt = f'<span style="background:rgba(224,82,82,0.10);color:#E05252;border:1px solid rgba(224,82,82,0.22);border-radius:4px;padding:2px 7px;font-size:10px">Out of domain</span>'
                             st.markdown(f"""
                             <div style="background:#162540;border:1px solid #1C2E48;border-radius:8px;
                                         padding:13px 16px;display:flex;align-items:center;gap:13px;