# Inspiration-Health-Data / scripts/featureEngineering.py
# Uploaded by DTanzillo ("Upload 5 files", commit 1cfa929, verified).
import numpy as np
import pandas as pd
def parse_timepoint(timepoint: str) -> float:
    """
    Convert a timepoint label such as 'L-3', 'L0', 'R+0' or 'R+1' into a
    numeric flight day on a stretched scale.

    The ~3 days of flight are stretched to 30 days so that pre-flight and
    post-flight samples are clearly separated; the final chart therefore
    contains synthetic positions between launch and return.

    Convention:
        L-0 -> 0    (launch day = Flight Day 0)
        L-3 -> -3   (3 days before launch)
        R+0 -> 30   (last day in space, stretched to day 30)
        R+1 -> 31   (first recovery day)
        R+N -> N+30 (general rule for post-return days)

    The label is stringified, stripped and upper-cased before parsing.
    '+' and '-' characters are ignored: 'L' labels always map to -N and
    'R' labels always map to N + 30. A label with no digits counts as 0.

    Returns:
        An int for recognised labels. NaN when the label does not start
        with 'L' or 'R', or when what follows the prefix is not an integer
        (e.g. 'Recovery', 'L-3.5'), so one malformed row cannot abort a
        column-wide ``apply``.
    """
    return_offset = 30  # stretched length of the flight, in days

    label = str(timepoint).strip().upper()
    if label.startswith("L"):  # Pre-launch
        prefix, sign, offset = "L", -1, 0
    elif label.startswith("R"):  # Return / post-flight
        prefix, sign, offset = "R", 1, return_offset
    else:
        return np.nan

    digits = label.replace(prefix, "").replace("+", "").replace("-", "") or "0"
    try:
        number = int(digits)
    except ValueError:
        # Right prefix but no parseable day count: treat it like any other
        # unrecognised label instead of raising.
        return np.nan
    return sign * number + offset
def add_flight_day(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a numeric 'flight_day' column derived from
    its 'timepoint' column via ``parse_timepoint``.

    Only 'timepoint' is required by this function. A 'Sample Name' column,
    if present, is removed from the result as redundant. The input frame is
    left untouched.

    Raises:
        ValueError: if ``df`` has no 'timepoint' column.
    """
    # Validate up front so a bad frame is rejected before any copying.
    if "timepoint" not in df.columns:
        raise ValueError("DataFrame must contain a 'timepoint' column")

    result = df.copy()

    # Numeric scale for plotting / ordering.
    result["flight_day"] = result["timepoint"].apply(parse_timepoint)

    # 'Sample Name' duplicates information already held elsewhere.
    redundant = [col for col in ("Sample Name",) if col in result.columns]
    if redundant:
        result = result.drop(columns=redundant)

    return result
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` extended with the derived Anion Gap feature.

        Anion Gap = Sodium - Chloride - Carbon Dioxide

    The feature is only added when 'sodium_value', 'chloride_value' and
    'carbon_dioxide_value' are all present; otherwise the copy is returned
    unchanged. Inputs are cast to float before subtracting. The input frame
    is never modified.
    """
    out = df.copy()

    inputs = ("sodium_value", "chloride_value", "carbon_dioxide_value")
    if any(col not in out.columns for col in inputs):
        return out

    sodium, chloride, carbon_dioxide = (out[col].astype(float) for col in inputs)
    out["anion_gap_value"] = sodium - chloride - carbon_dioxide

    # Placeholders; min/max defined manually in stats.ANALYTE_INFO
    for bound in ("min", "max"):
        out[f"anion_gap_range_{bound}"] = np.nan

    return out