bobbypaton committed
Commit · da8eac4
1 Parent(s): 5a434bd
Fix hf_hub_download import; restore complete worker.py
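The import statements being fixed sit in unchanged context that the diff below does not display, so their exact form is an assumption; presumably the commit restores the standard huggingface_hub import that _log_prediction() relies on:

# Hypothetical sketch of the restored import (not shown in the diff below);
# HfApi and hf_hub_download are the huggingface_hub names used in _log_prediction().
from huggingface_hub import HfApi, hf_hub_download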
worker.py CHANGED
@@ -1,3 +1,79 @@
+"""
+CASCADE worker process
+"""
+
+import os
+import sys
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+
+import json
+import math
+import pickle
+import datetime
+from io import StringIO
+
+import redis
+import numpy as np
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+import keras
+from keras.models import load_model
+from NMR_Prediction.apply import (
+    preprocess_C, preprocess_H,
+    evaluate_C, evaluate_H,
+    RBFSequence,
+)
+from nfp.layers import (
+    MessageLayer, GRUStep, Squeeze, EdgeNetwork,
+    ReduceBondToPro, ReduceBondToAtom,
+    GatherAtomToBond, ReduceAtomToPro,
+)
+from nfp.models import GraphModel
+
+import pandas as pd
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from rdkit.Chem import SDWriter
+from NMR_Prediction.genConf import genConf
+
+MODEL_PATH_C = os.path.join("NMR_Prediction", "schnet_edgeupdate", "best_model.hdf5")
+MODEL_PATH_H = os.path.join("NMR_Prediction", "schnet_edgeupdate_H", "best_model.hdf5")
+PREPROCESSOR_PATH = os.path.join("NMR_Prediction", "preprocessor.p")
+
+custom_objects = {
+    "MessageLayer": MessageLayer,
+    "GRUStep": GRUStep,
+    "Squeeze": Squeeze,
+    "EdgeNetwork": EdgeNetwork,
+    "ReduceBondToPro": ReduceBondToPro,
+    "ReduceBondToAtom": ReduceBondToAtom,
+    "GatherAtomToBond": GatherAtomToBond,
+    "ReduceAtomToPro": ReduceAtomToPro,
+    "GraphModel": GraphModel,
+}
+
+print("Loading 13C model...", flush=True)
+model_C = load_model(MODEL_PATH_C, custom_objects=custom_objects)
+print("Loading 1H model...", flush=True)
+model_H = load_model(MODEL_PATH_H, custom_objects=custom_objects)
+print("Both models loaded.", flush=True)
+
+with open(PREPROCESSOR_PATH, "rb") as f:
+    preprocessor = pickle.load(f)["preprocessor"]
+
+redis_client = redis.StrictRedis(
+    host="localhost", port=6379, db=0, decode_responses=True
+)
+
+# ── Analytics logging to HF Dataset ──────────────────────────────────────────
+_HF_TOKEN = os.environ.get("HF_TOKEN", "")
+_ANALYTICS_REPO = "patonlab/analytics"
+_ANALYTICS_FILE = "data.csv"
+
 def _log_prediction():
     """Append one row to the existing patonlab/analytics data.csv.
     Format matches the alfabet log: space,timestamp
@@ -11,7 +87,6 @@ def _log_prediction():
         api = HfApi(token=_HF_TOKEN)
         timestamp = datetime.datetime.utcnow().isoformat()
 
-        # Download current CSV, append a row, re-upload
         with tempfile.TemporaryDirectory() as tmpdir:
             local_path = hf_hub_download(
                 repo_id=_ANALYTICS_REPO,
@@ -31,4 +106,161 @@ def _log_prediction():
                 commit_message=f"log: cascade prediction {timestamp[:10]}",
             )
     except Exception as e:
-        print(f"Analytics logging failed (non-fatal): {e}", flush=True)
+        print(f"Analytics logging failed (non-fatal): {e}", flush=True)
+
+
+def _mol_to_sdf(mol, conf_id=0):
+    sio = StringIO()
+    w = SDWriter(sio)
+    w.write(mol, confId=conf_id)
+    w.close()
+    return sio.getvalue()
+
+
+def _build_sdfs_from_genconf(mol_with_confs, ids):
+    sdfs = []
+    energy_order = []
+    for energy, conf_id in ids:
+        try:
+            sdf = _mol_to_sdf(mol_with_confs, conf_id=int(conf_id))
+            if sdf.strip():
+                sdfs.append(sdf)
+                energy_order.append(int(conf_id))
+        except Exception as e:
+            print(f"SDF error for conf_id={conf_id}: {e}", flush=True)
+    return sdfs, energy_order
+
+
+def _boltzmann_average(spread_df):
+    spread_df["b_weight"] = spread_df["relative_E"].apply(
+        lambda x: math.exp(-x / (0.001987 * 298.15))
+    )
+    df_group = spread_df.set_index(["mol_id", "atom_index", "cf_id"]).groupby(level=[0, 1])
+    final = []
+    for (m_id, a_id), df in df_group:
+        ws = (df["b_weight"] * df["predicted"]).sum() / df["b_weight"].sum()
+        final.append([m_id, a_id, ws])
+    final = pd.DataFrame(final, columns=["mol_id", "atom_index", "Shift"])
+    final["atom_index"] = final["atom_index"].apply(lambda x: x + 1)
+    return final.round(2).astype(dtype={"atom_index": "int"})
+
+
+def _fmt_weighted(final_df):
+    return "".join(f"{int(r['atom_index'])},{r['Shift']:.2f};" for _, r in final_df.iterrows())
+
+
+def _fmt_conf_shifts(spread_df, energy_order):
+    parts = []
+    for cf_id in energy_order:
+        sub = spread_df[spread_df["cf_id"] == cf_id]
+        if len(sub) == 0:
+            continue
+        parts.append("".join(f"{int(r['atom_index'])},{r['predicted']:.2f};" for _, r in sub.iterrows()))
+    return "!".join(parts)
+
+
+def _fmt_relative_E(spread_df, energy_order):
+    total_bw = spread_df.groupby("cf_id")["b_weight"].first().sum()
+    parts = []
+    for cf_id in energy_order:
+        sub = spread_df[spread_df["cf_id"] == cf_id]
+        if len(sub) == 0:
+            continue
+        e = round(sub["relative_E"].iloc[0], 2)
+        bw = round(sub["b_weight"].iloc[0] / total_bw, 4)
+        parts.append(f"{e},{bw},")
+    return "!".join(parts)
+
+
+def run_job(task_id, smiles, type_):
+    result_key = f"task_result_{task_id}"
+    try:
+        mol = Chem.MolFromSmiles(smiles)
+        AllChem.EmbedMolecule(mol, useRandomCoords=True)
+        mol_with_h = Chem.AddHs(mol, addCoords=True)
+
+        mol_with_confs, ids, nr = genConf(mol_with_h, rms=-1, nc=200, efilter=10.0, rmspost=0.5)
+        print(f"genConf: {len(ids)} conformers", flush=True)
+
+        conf_sdfs, energy_order = _build_sdfs_from_genconf(mol_with_confs, ids)
+
+        mols = [Chem.MolFromSmiles(smiles)]
+        for m in mols:
+            AllChem.EmbedMolecule(m, useRandomCoords=True)
+        mols = [Chem.AddHs(m, addCoords=True) for m in mols]
+
+        # Suppress duplicate genConf stdout during preprocess
+        _stdout, _stderr = sys.stdout, sys.stderr
+        sys.stdout = sys.stderr = open(os.devnull, 'w')
+        try:
+            if type_ == "C":
+                inputs, df, conf_mols = preprocess_C(mols, preprocessor, keep_all_cf=True)
+            else:
+                inputs, df, conf_mols = preprocess_H(mols, preprocessor, keep_all_cf=True)
+        finally:
+            sys.stdout.close()
+            sys.stdout, sys.stderr = _stdout, _stderr
+
+        if type_ == "C":
+            predicted = evaluate_C(inputs, preprocessor, model_C)
+        else:
+            predicted = evaluate_H(inputs, preprocessor, model_H)
+
+        if len(inputs) == 0:
+            raise RuntimeError("No conformers generated")
+
+        spread_df = pd.DataFrame(columns=["mol_id", "atom_index", "relative_E", "cf_id"])
+        for _, r in df.iterrows():
+            n = len(r["atom_index"])
+            tmp = pd.DataFrame({
+                "mol_id": [r["mol_id"]] * n,
+                "atom_index": r["atom_index"],
+                "relative_E": [r["relative_E"]] * n,
+                "cf_id": [r["cf_id"]] * n,
+            })
+            spread_df = pd.concat([spread_df, tmp], sort=True)
+
+        spread_df["predicted"] = predicted
+        spread_df["b_weight"] = spread_df["relative_E"].apply(
+            lambda x: math.exp(-x / (0.001987 * 298.15))
+        )
+        spread_df["atom_index"] = spread_df["atom_index"].apply(lambda x: x + 1)
+        spread_df = spread_df.round(2)
+
+        final_df = _boltzmann_average(
+            spread_df.copy().assign(
+                atom_index=spread_df["atom_index"].apply(lambda x: x - 1)
+            )
+        )
+
+        result = {
+            "smiles": smiles,
+            "type_": type_,
+            "conf_sdfs": conf_sdfs,
+            "weightedShiftTxt": _fmt_weighted(final_df),
+            "confShiftTxt": _fmt_conf_shifts(spread_df, energy_order),
+            "relative_E": _fmt_relative_E(spread_df, energy_order),
+        }
+        redis_client.set(result_key, json.dumps(result), ex=3600)
+        print(f"Task {task_id} complete – {len(conf_sdfs)} conformers", flush=True)
+
+        # Log to analytics dataset (non-blocking)
+        _log_prediction()
+
+    except Exception as e:
+        import traceback; traceback.print_exc()
+        redis_client.set(result_key, json.dumps({"errMessage": str(e)}), ex=3600)
+
+
+print("Worker ready, waiting for jobs...", flush=True)
+while True:
+    item = redis_client.blpop("task_queue", timeout=5)
+    if item is None:
+        continue
+    _, task_id = item
+    detail = redis_client.get(f"task_detail_{task_id}")
+    if not detail:
+        continue
+    detail = json.loads(detail)
+    print(f"Processing task {task_id} smiles={detail['smiles']} type={detail['type_']}", flush=True)
+    run_job(task_id, detail["smiles"], detail["type_"])