microbe-model / scripts /31_merge_features.py
Miyu Horiuchi
Deploy app from main@a3254bf (no paper/ binaries)
0ed74db
"""Merge per-genome and per-strain feature tables into a single features.parquet.
Joins:
data/features.parquet (per-strain, base genome composition: 355 cols)
+ data/kegg_modules.parquet (per-genome, KEGG module completeness)
+ data/hmm_features.parquet (per-genome, unified marker HMM hits)
+ data/isolation_metadata.parquet (per-strain, BacDive isolation source)
Output: data/features.parquet (overwritten in place; the pre-existing file is
copied to features.parquet.bak only if no backup exists yet).
"""
from __future__ import annotations
import shutil
from pathlib import Path
import pandas as pd
from microbe_model import config
def merge_feature_tables(
    base: pd.DataFrame,
    kegg: pd.DataFrame,
    hmm: pd.DataFrame,
    iso: pd.DataFrame,
) -> pd.DataFrame:
    """Left-join the kegg, hmm and iso tables onto ``base``.

    ``kegg`` and ``hmm`` are joined on ``genome_accession``; ``iso`` is joined
    on ``bacdive_id``. Before each join, any non-key column of the incoming
    table whose name already exists in the accumulated frame is dropped, so
    the first table to provide a column wins and pandas never has to invent
    ``_x``/``_y``/``_hmm``/``_iso`` suffixed duplicates. As a consequence,
    calling this on an already-merged ``base`` leaves its columns unchanged
    instead of duplicating them.

    Args:
        base: Primary table; every one of its rows is kept (left joins).
        kegg: Table keyed by ``genome_accession``.
        hmm: Table keyed by ``genome_accession``.
        iso: Table keyed by ``bacdive_id``.

    Returns:
        The merged frame: all ``base`` columns followed by the new columns of
        ``kegg``, ``hmm`` and ``iso`` in that order.
    """
    merged = base
    joins = (
        ("kegg", kegg, "genome_accession"),
        ("hmm", hmm, "genome_accession"),
        ("iso", iso, "bacdive_id"),
    )
    for label, table, key in joins:
        # Resolve name collisions up front rather than relying on merge
        # suffixes, which would silently rename or duplicate columns.
        clashes = [c for c in table.columns if c != key and c in merged.columns]
        if clashes:
            print(f" dropping {len(clashes)} {label} cols already present: {clashes}")
            table = table.drop(columns=clashes)
        merged = merged.merge(table, on=key, how="left")

    # A left join only grows when the right-hand key is not unique.
    if len(merged) != len(base):
        print(
            f" WARNING: row count changed from {len(base):,} to {len(merged):,}; "
            "a secondary table has duplicate join keys"
        )
    return merged


def main() -> None:
    """Merge the secondary feature tables into ``features.parquet``.

    Reads ``features.parquet``, ``kegg_modules.parquet``,
    ``hmm_features.parquet`` and ``isolation_metadata.parquet`` from
    ``config.DATA``, normalizes the join keys, keeps only numeric/bool
    isolation columns, joins everything onto the base table and writes the
    result back to ``features.parquet``. The previous ``features.parquet`` is
    copied to ``features.parquet.bak`` only when no backup exists yet, so an
    existing backup is never overwritten by a later run.
    """
    data_dir: Path = config.DATA

    print("Loading inputs...")
    base = pd.read_parquet(data_dir / "features.parquet")
    kegg = pd.read_parquet(data_dir / "kegg_modules.parquet")
    hmm = pd.read_parquet(data_dir / "hmm_features.parquet")
    iso = pd.read_parquet(data_dir / "isolation_metadata.parquet")

    # Normalize join keys so both sides of each merge share a dtype.
    for table in (base, kegg, hmm):
        table["genome_accession"] = table["genome_accession"].astype(str)
    for table in (base, iso):
        table["bacdive_id"] = table["bacdive_id"].astype(int)

    print(f" base: {len(base):>6,} × {base.shape[1]:>4}")
    print(f" kegg: {len(kegg):>6,} × {kegg.shape[1]:>4}")
    print(f" hmm: {len(hmm):>6,} × {hmm.shape[1]:>4}")
    print(f" iso: {len(iso):>6,} × {iso.shape[1]:>4}")

    # Drop string-typed columns from isolation_metadata; numeric/one-hot encoded
    # versions of the same fields already exist alongside them.
    iso_numeric = iso.select_dtypes(include=["number", "bool"]).copy()
    if "bacdive_id" not in iso_numeric.columns:
        iso_numeric["bacdive_id"] = iso["bacdive_id"].astype(int)
    dropped = sorted(set(iso.columns) - set(iso_numeric.columns))
    if dropped:
        print(f" dropping {len(dropped)} non-numeric iso cols: {dropped}")

    merged = merge_feature_tables(base, kegg, hmm, iso_numeric)

    print(f"\nmerged: {len(merged):,} rows × {merged.shape[1]} cols")
    print(f" rows with kegg features: {merged.filter(like='kegg_').notna().any(axis=1).sum():,}")
    print(f" rows with hmm features: {merged.filter(like='hmm_').notna().any(axis=1).sum():,}")
    print(f" rows with iso features: {merged.filter(like='iso_').notna().any(axis=1).sum():,}")

    out = data_dir / "features.parquet"
    backup = data_dir / "features.parquet.bak"
    # Only back up when no backup exists, so a re-run cannot replace the
    # existing backup with an already-merged file.
    if out.exists() and not backup.exists():
        shutil.copy2(out, backup)
        print(f"\nbacked up original to {backup}")
    merged.to_parquet(out, index=False)
    print(f"wrote {out}: {len(merged):,} rows × {merged.shape[1]} cols")
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()