# Tidy transformation and R+1 vs L-series statistics for astronaut analyte panels.
import pandas as pd
import numpy as np
from scipy import stats
from .featureEngineering import parse_timepoint
# Map analyte base names to human labels + units (+ manual reference ranges
# for derived features that have no *_range_min/*_range_max columns).
# Sub/superscript characters in the labels were produced with ChatGPT:
# https://chatgpt.com/share/68d9c8f6-2674-8008-8ff7-0731bec9ad49
ANALYTE_INFO = {
    # Blood chemistry panel (matched to …_value columns in the wide CSV)
    "albumin": {"label": "Albumin", "unit": "g/dL"},
    "alkaline_phosphatase": {"label": "Alkaline Phosphatase", "unit": "U/L"},
    "alt": {"label": "ALT", "unit": "U/L"},
    "ast": {"label": "AST", "unit": "U/L"},
    "total_bilirubin": {"label": "Bilirubin", "unit": "mg/dL"},
    "bun_to_creatinine_ratio": {"label": "BUN/Creatinine Ratio", "unit": ""},
    "calcium": {"label": "Ca²⁺", "unit": "mg/dL"},
    "carbon_dioxide": {"label": "CO₂", "unit": "mmol/L"},
    "chloride": {"label": "Cl⁻", "unit": "mmol/L"},
    "creatinine": {"label": "Creatinine", "unit": "mg/dL"},
    "egfr_african_american": {"label": "eGFR (AA)", "unit": "mL/min/1.73m²"},
    "egfr_non_african_american": {"label": "eGFR (non-AA)", "unit": "mL/min/1.73m²"},
    "globulin": {"label": "Globulin", "unit": "g/dL"},
    "glucose": {"label": "Glucose", "unit": "mg/dL"},
    "potassium": {"label": "K⁺", "unit": "mmol/L"},
    "total_protein": {"label": "Protein", "unit": "g/dL"},
    "sodium": {"label": "Na⁺", "unit": "mmol/L"},
    "urea_nitrogen_bun": {"label": "BUN", "unit": "mg/dL"},
    # Derived feature — no raw CSV columns, so the reference range is set manually.
    "anion_gap": {
        "label": "Anion Gap",
        "unit": "mmol/L",
        "min": 8,  # manual reference range
        "max": 24
    },
    # Cardiovascular panel (matched to …_concentration columns in the wide CSV)
    "a2_macroglobulin": {"label": "α₂-Macroglobulin", "unit": "ng/mL"},
    "agp": {"label": "AGP (α1-acid glycoprotein)", "unit": "ng/mL"},
    "crp": {"label": "CRP (C-reactive protein)", "unit": "pg/mL"},
    "fetuin_a36": {"label": "Fetuin A3/6", "unit": "ng/mL"},
    "fibrinogen": {"label": "Fibrinogen", "unit": "ng/mL"},
    "haptoglobin": {"label": "Haptoglobin", "unit": "ng/mL"},
    "l_selectin": {"label": "L-Selectin", "unit": "pg/mL"},
    "pf4": {"label": "Platelet Factor 4", "unit": "ng/mL"},
    "sap": {"label": "SAP (Serum Amyloid P)", "unit": "pg/mL"},
}
# Helpers to find columns by prefix (robust to unit suffixes)
def _first_col_startswith(df: pd.DataFrame, prefixes) -> str | None:
"""
Return the first column whose lowercase name starts with any prefix in `prefixes`.
"""
if isinstance(prefixes, str):
prefixes = [prefixes]
prefixes = [p.lower() for p in prefixes]
for col in df.columns:
cl = col.lower()
for p in prefixes:
if cl.startswith(p):
return col
return None
def _value_min_max_cols(df: pd.DataFrame, analyte: str):
"""
For a given base analyte name, return (value_col, min_col, max_col).
Works with clinical chemistry (…_value) and cardiovascular (…_concentration / …_percent).
"""
v = _first_col_startswith(df, f"{analyte}_value")
if v is None:
v = _first_col_startswith(df, f"{analyte}_concentration")
mn = _first_col_startswith(df, [f"{analyte}_range_min", f"{analyte}_min"])
mx = _first_col_startswith(df, [f"{analyte}_range_max", f"{analyte}_max"])
return v, mn, mx
# Tidy Transformation
def tidy_from_wide(df: pd.DataFrame) -> pd.DataFrame:
    """
    Melt the wide astronaut CSV (value/min/max column triplets) into tidy
    long format — one record per (astronaut, timepoint, analyte).

    Reference ranges come from the CSV's *_min/*_max columns when present,
    falling back to the manual "min"/"max" entries in ANALYTE_INFO.
    Derived analytes (e.g. anion_gap) have no raw column triplet and are
    not emitted here.

    Returns
    -------
    DataFrame with columns [astronautID, timepoint, flight_day, analyte,
    value, min, max, label, unit, sex].

    Raises
    ------
    KeyError if the astronautID or timepoint column is missing.
    """
    # Case-insensitive lookup for the two required identifier columns.
    actual_name = {c.lower(): c for c in df.columns}
    id_col = actual_name.get("astronautid")
    tp_col = actual_name.get("timepoint")
    if id_col is None or tp_col is None:
        raise KeyError("Expected astronautID and timepoint columns in input CSV")

    records = []
    for analyte, meta in ANALYTE_INFO.items():
        if analyte == "anion_gap":  # derived feature: no source columns
            continue
        value_col, min_col, max_col = _value_min_max_cols(df, analyte)
        if value_col is None:  # analyte absent from this CSV
            continue
        for _, row in df.iterrows():
            astro = row[id_col]
            tp = row[tp_col]
            lo = row[min_col] if (min_col and pd.notna(row[min_col])) else meta.get("min")
            hi = row[max_col] if (max_col and pd.notna(row[max_col])) else meta.get("max")
            records.append({
                "astronautID": astro,
                "timepoint": tp,
                "flight_day": parse_timepoint(tp),
                "analyte": analyte,
                "value": row[value_col],
                "min": lo,
                "max": hi,
                "label": meta["label"],
                "unit": meta["unit"],
                # NOTE(review): sex is hard-coded by astronaut ID — confirm
                # against the mission roster before adding new IDs.
                "sex": "Male" if str(astro) in ["C001", "C004"] else "Female",
            })
    return pd.DataFrame(records)
# Statistical Comparison: R+1 vs L-series
def _split_L_R1(adf: pd.DataFrame):
    """Split one astronaut's rows into pre-flight (L-series) and R+1 float Series."""
    tps = adf["timepoint"].astype(str)
    L_vals = adf.loc[tps.str.startswith("L"), "value"].dropna().astype(float)
    R1_vals = adf.loc[tps.isin(["R+1", "R1", "R+01"]), "value"].dropna().astype(float)
    return L_vals, R1_vals


def analyze_r1_vs_L(tidy: pd.DataFrame) -> pd.DataFrame:
    """
    Compare R+1 vs L-series for each analyte.

    - Within-astronaut: one-sample t-test (H0: mean(L) == R+1).
      Returns per-astronaut mean, std, SE, t-stat, p-value, and Cohen's d.
      Requires >= 2 pre-flight draws (for a variance) and exactly one R+1.
    - Across-astronauts (group-level): paired t-test on per-astronaut
      mean(L) vs R+1. Returns group mean, std across astronauts, SEM,
      t-stat, p-value, and Cohen's d. Requires >= 2 qualifying astronauts.

    NOTE(review): the within-astronaut t-statistic is signed as
    mean(L) - R+1 while Cohen's d and the group t-statistic are signed as
    R+1 - mean(L); two-sided p-values are unaffected, but confirm the
    intended sign convention before interpreting directions.

    Parameters
    ----------
    tidy : DataFrame with at least [analyte, astronautID, timepoint, value].

    Returns
    -------
    DataFrame with one "within" row per (analyte, astronaut) and one
    "group" row (astronautID == "ALL") per analyte.
    """
    results = []
    for analyte, subdf in tidy.groupby("analyte"):
        # Single pass over astronauts: run each within-astronaut test and, for
        # the same qualifying astronauts, collect the per-astronaut summaries
        # the group-level paired test needs (previously done in two passes).
        astronaut_means, astronaut_R1 = [], []
        for astronaut, adf in subdf.groupby("astronautID"):
            L_vals, R1_vals = _split_L_R1(adf)
            if len(L_vals) < 2 or len(R1_vals) != 1:
                continue
            R1 = float(R1_vals.iloc[0])
            mean_L = float(L_vals.mean())
            std_L = float(L_vals.std(ddof=1))
            n_L = int(L_vals.shape[0])
            astronaut_means.append(mean_L)
            astronaut_R1.append(R1)
            if std_L > 0:
                se = std_L / np.sqrt(n_L)
                t_stat = (mean_L - R1) / se
                # sf(|t|) == 1 - cdf(|t|) but is numerically stabler in the tail.
                p_val = 2 * stats.t.sf(abs(t_stat), df=n_L - 1)
                cohen_d = (R1 - mean_L) / std_L
            else:
                # Zero pre-flight variance: t statistic is undefined.
                se = t_stat = p_val = cohen_d = np.nan
            results.append({
                "analyte": analyte,
                "astronautID": astronaut,
                "test_type": "within",
                "n_L": n_L,
                "mean_L": round(mean_L, 2),
                "R1": round(R1, 2),
                "std_L": round(std_L, 2),
                "se_L": round(se, 2) if pd.notna(se) else np.nan,
                "t_stat": round(t_stat, 3) if pd.notna(t_stat) else np.nan,
                "p_value": round(p_val, 4) if pd.notna(p_val) else np.nan,
                "effect_size": round(cohen_d, 3) if pd.notna(cohen_d) else np.nan,
            })
        ## Across-astronauts (paired test)
        if len(astronaut_means) >= 2:
            diffs = np.asarray(astronaut_R1) - np.asarray(astronaut_means)
            t_stat, p_val = stats.ttest_rel(astronaut_R1, astronaut_means)
            # Group-level variability of the pre-flight means.
            std_L = float(np.std(astronaut_means, ddof=1))
            se_L = std_L / np.sqrt(len(astronaut_means))
            sd_diff = float(diffs.std(ddof=1))
            cohen_d = diffs.mean() / sd_diff if sd_diff > 0 else np.nan
            results.append({
                "analyte": analyte,
                "astronautID": "ALL",
                "test_type": "group",
                "n_L": len(astronaut_means),
                "mean_L": round(float(np.mean(astronaut_means)), 2),
                "R1": round(float(np.mean(astronaut_R1)), 2),
                # BUGFIX: std_L / se_L were computed but never reported,
                # although the docstring promises them for the group row.
                "std_L": round(std_L, 2),
                "se_L": round(float(se_L), 2),
                "t_stat": round(float(t_stat), 3),
                "p_value": round(float(p_val), 4),
                "effect_size": round(float(cohen_d), 3) if pd.notna(cohen_d) else np.nan,
            })
    return pd.DataFrame(results)