#!/usr/bin/env python3
"""
extract_iptm_affinity_csv_all.py
Writes:
- out_dir/wt_iptm_affinity_all.csv
- out_dir/smiles_iptm_affinity_all.csv
Also prints:
- N
- Spearman rho (affinity vs iptm)
- Pearson r (affinity vs iptm)
"""
from pathlib import Path
import numpy as np
import pandas as pd
def corr_stats(df: pd.DataFrame, x: str, y: str):
    """Return sample count plus Pearson r and Spearman rho between two columns.

    Entries in either column are coerced to numeric (bad values become NaN),
    and rows with a NaN on either side are excluded before computing the
    statistics. With fewer than two valid pairs both correlations are NaN.
    """
    a = pd.to_numeric(df[x], errors="coerce")
    b = pd.to_numeric(df[y], errors="coerce")
    valid = a.notna() & b.notna()
    a, b = a[valid], b[valid]
    count = int(valid.sum())
    if count > 1:
        pearson = float(a.corr(b, method="pearson"))
        spearman = float(a.corr(b, method="spearman"))
    else:
        # Correlation is undefined for 0 or 1 points.
        pearson = spearman = float("nan")
    return {"n": count, "pearson_r": pearson, "spearman_rho": spearman}
def clean_one(
    in_csv: Path,
    out_csv: Path,
    iptm_col: str,
    affinity_col: str = "affinity",
    keep_cols=(),
):
    """Clean one metadata CSV down to (affinity, iptm) pairs and report stats.

    Reads ``in_csv``, coerces the affinity and iptm columns to numeric, drops
    rows where either is NaN, and writes a standardized CSV to ``out_csv``
    with columns ``affinity`` and ``iptm`` (plus ``split`` and any of
    ``keep_cols`` that exist in the input). Also writes an
    ``<out_csv stem>.stats.json`` next to the CSV with N / Pearson / Spearman,
    and prints the same summary.

    Args:
        in_csv: Input metadata CSV path.
        out_csv: Output CSV path (parent dirs are created as needed).
        iptm_col: Name of the iptm column in the input.
        affinity_col: Name of the affinity column in the input.
        keep_cols: Optional extra columns to carry through for labeling/debug.

    Raises:
        ValueError: if the affinity or iptm column is missing from the input.
    """
    # Hoisted from inside the file-write block: imports belong at the top of
    # the scope, not mid-statement inside a context manager.
    import json

    df = pd.read_csv(in_csv)

    # Both affinity and iptm must exist in the input.
    need = [affinity_col, iptm_col]
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise ValueError(f"{in_csv} missing columns: {missing}. Found: {list(df.columns)}")

    # Coerce to numeric (unparseable entries become NaN), then drop rows
    # missing either value.
    df[affinity_col] = pd.to_numeric(df[affinity_col], errors="coerce")
    df[iptm_col] = pd.to_numeric(df[iptm_col], errors="coerce")
    df = df.dropna(subset=[affinity_col, iptm_col]).reset_index(drop=True)

    # Output columns with standardized names.
    out = pd.DataFrame({
        "affinity": df[affinity_col].astype(float),
        "iptm": df[iptm_col].astype(float),
    })
    # Keep split if present (handy for coloring later, but not used for corr).
    if "split" in df.columns:
        out.insert(0, "split", df["split"].astype(str))
    # Optional extras for labeling/debug; silently skipped when absent.
    for c in keep_cols:
        if c in df.columns:
            out[c] = df[c]

    out_csv.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(out_csv, index=False)

    stats = corr_stats(out, "iptm", "affinity")
    print(f"[write] {out_csv}")
    print(f" N={stats['n']} | Pearson r={stats['pearson_r']:.4f} | Spearman rho={stats['spearman_rho']:.4f}")

    # Persist the stats as JSON next to the CSV (e.g. foo.csv -> foo.stats.json).
    stats_path = out_csv.with_suffix(".stats.json")
    with open(stats_path, "w") as f:
        json.dump(
            {
                "input_csv": str(in_csv),
                "output_csv": str(out_csv),
                "iptm_col": iptm_col,
                "affinity_col": affinity_col,
                **stats,
            },
            f,
            indent=2,
        )
def main():
    """CLI entry point: clean both metadata CSVs and report correlation stats."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--wt_meta_csv", type=str, required=True)
    parser.add_argument("--smiles_meta_csv", type=str, required=True)
    parser.add_argument("--out_dir", type=str, required=True)
    parser.add_argument("--wt_iptm_col", type=str, default="wt_iptm_score")
    parser.add_argument("--smiles_iptm_col", type=str, default="smiles_iptm_score")
    parser.add_argument("--affinity_col", type=str, default="affinity")
    opts = parser.parse_args()

    out_dir = Path(opts.out_dir)

    # Wild-type table.
    clean_one(
        Path(opts.wt_meta_csv),
        out_dir / "wt_iptm_affinity_all.csv",
        iptm_col=opts.wt_iptm_col,
        affinity_col=opts.affinity_col,
        keep_cols=("seq1", "seq2", "Fasta2SMILES", "REACT_SMILES"),
    )
    # SMILES table.
    clean_one(
        Path(opts.smiles_meta_csv),
        out_dir / "smiles_iptm_affinity_all.csv",
        iptm_col=opts.smiles_iptm_col,
        affinity_col=opts.affinity_col,
        keep_cols=("seq1", "seq2", "Fasta2SMILES", "REACT_SMILES", "smiles_sequence"),
    )

    print(f"\n[DONE] CSVs + stats JSONs in: {out_dir}")


if __name__ == "__main__":
    main()