Joblib
File size: 3,856 Bytes
baf3373
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
"""
extract_iptm_affinity_csv_all.py

Writes:
  - out_dir/wt_iptm_affinity_all.csv
  - out_dir/smiles_iptm_affinity_all.csv

Also prints:
  - N
  - Spearman rho (affinity vs iptm)
  - Pearson r (affinity vs iptm)
"""

from pathlib import Path
import numpy as np
import pandas as pd


def corr_stats(df: pd.DataFrame, x: str, y: str):
    """Compute sample size and Pearson/Spearman correlations for two columns.

    Both columns are coerced to numeric (non-parsable entries become NaN),
    and any row with a NaN in either column is excluded before computing
    the statistics.

    Args:
        df: frame holding the two columns of interest.
        x: name of the first column.
        y: name of the second column.

    Returns:
        dict with keys "n" (int paired-sample count), "pearson_r" and
        "spearman_rho" (floats; NaN when fewer than two valid pairs).
    """
    a = pd.to_numeric(df[x], errors="coerce")
    b = pd.to_numeric(df[y], errors="coerce")
    valid = a.notna() & b.notna()
    a = a[valid]
    b = b[valid]
    count = int(valid.sum())

    # Correlations are undefined for fewer than two paired observations.
    if count > 1:
        pearson = float(a.corr(b, method="pearson"))
        spearman = float(a.corr(b, method="spearman"))
    else:
        pearson = float("nan")
        spearman = float("nan")

    return {"n": count, "pearson_r": pearson, "spearman_rho": spearman}


def clean_one(
    in_csv: Path,
    out_csv: Path,
    iptm_col: str,
    affinity_col: str = "affinity",
    keep_cols=(),
):
    """Standardize one metadata CSV into (affinity, iptm) columns and write it.

    Reads `in_csv`, coerces the affinity and ipTM columns to numeric, drops
    rows where either is NaN, and writes `out_csv` with standardized column
    names ("affinity", "iptm"), plus "split" and any `keep_cols` that exist
    in the input. A correlation summary is printed and also saved next to
    the CSV as `<stem>.stats.json`.

    Args:
        in_csv: input CSV path; must contain `affinity_col` and `iptm_col`.
        out_csv: destination CSV path (parent directories are created).
        iptm_col: name of the ipTM score column in the input.
        affinity_col: name of the affinity column in the input.
        keep_cols: optional extra column names copied through when present.

    Raises:
        ValueError: if `affinity_col` or `iptm_col` is missing from `in_csv`.
    """
    # Hoisted out of the file-write block below; importing mid-write was
    # harmless but obscured the function's dependencies.
    import json

    df = pd.read_csv(in_csv)

    # affinity + iptm must exist
    missing = [c for c in (affinity_col, iptm_col) if c not in df.columns]
    if missing:
        raise ValueError(f"{in_csv} missing columns: {missing}. Found: {list(df.columns)}")

    # coerce numeric, then drop rows where either value is NaN
    df[affinity_col] = pd.to_numeric(df[affinity_col], errors="coerce")
    df[iptm_col] = pd.to_numeric(df[iptm_col], errors="coerce")
    df = df.dropna(subset=[affinity_col, iptm_col]).reset_index(drop=True)

    # output cols (standardize names)
    out = pd.DataFrame({
        "affinity": df[affinity_col].astype(float),
        "iptm": df[iptm_col].astype(float),
    })

    # keep split if present (handy for coloring later, but not used for corr)
    if "split" in df.columns:
        out.insert(0, "split", df["split"].astype(str))

    # optional extras for labeling/debug
    for c in keep_cols:
        if c in df.columns:
            out[c] = df[c]

    out_csv.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(out_csv, index=False)

    stats = corr_stats(out, "iptm", "affinity")
    print(f"[write] {out_csv}")
    print(f"  N={stats['n']} | Pearson r={stats['pearson_r']:.4f} | Spearman rho={stats['spearman_rho']:.4f}")

    # also save stats json next to csv (e.g. foo.csv -> foo.stats.json)
    stats_path = out_csv.with_suffix(".stats.json")
    with open(stats_path, "w") as f:
        json.dump(
            {
                "input_csv": str(in_csv),
                "output_csv": str(out_csv),
                "iptm_col": iptm_col,
                "affinity_col": affinity_col,
                **stats,
            },
            f,
            indent=2,
        )


def main():
    """CLI entry point: clean the WT and SMILES metadata CSVs into out_dir."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--wt_meta_csv", type=str, required=True)
    parser.add_argument("--smiles_meta_csv", type=str, required=True)
    parser.add_argument("--out_dir", type=str, required=True)

    parser.add_argument("--wt_iptm_col", type=str, default="wt_iptm_score")
    parser.add_argument("--smiles_iptm_col", type=str, default="smiles_iptm_score")
    parser.add_argument("--affinity_col", type=str, default="affinity")
    args = parser.parse_args()

    out_dir = Path(args.out_dir)

    # (input csv, output filename, iptm column, extra columns to carry through)
    jobs = [
        (
            args.wt_meta_csv,
            "wt_iptm_affinity_all.csv",
            args.wt_iptm_col,
            ("seq1", "seq2", "Fasta2SMILES", "REACT_SMILES"),
        ),
        (
            args.smiles_meta_csv,
            "smiles_iptm_affinity_all.csv",
            args.smiles_iptm_col,
            ("seq1", "seq2", "Fasta2SMILES", "REACT_SMILES", "smiles_sequence"),
        ),
    ]
    for src_csv, out_name, iptm_column, extras in jobs:
        clean_one(
            Path(src_csv),
            out_dir / out_name,
            iptm_col=iptm_column,
            affinity_col=args.affinity_col,
            keep_cols=extras,
        )

    print(f"\n[DONE] CSVs + stats JSONs in: {out_dir}")


# Run the CLI only when executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()