Spaces:
No application file
No application file
Delete scripts
Browse files- scripts/featureEngineering.py +0 -63
- scripts/graphMaking.py +0 -256
- scripts/stats.py +0 -205
scripts/featureEngineering.py
DELETED
|
@@ -1,63 +0,0 @@
|
|
| 1 |
-
import numpy as np
|
| 2 |
-
import pandas as pd
|
| 3 |
-
|
| 4 |
-
def parse_timepoint(timepoint: str) -> int:
    """
    Convert timepoint labels like 'L-3', 'L0', 'R+0', 'R+1' into numeric
    flight days on a stretched scale.

    The 3 actual days of flight are stretched onto a 30-day scale so the
    in-flight interval is visually distinguishable; the resulting chart
    therefore uses a stretched (not literal) time axis.

    Convention:
        L-0 -> 0    (launch day = Flight Day 0)
        L-3 -> -3   (3 days before launch)
        R+0 -> 30   (last day in space, stretched to day 30)
        R+1 -> 31   (first recovery day)
        R+N -> N+30 (general rule for post-launch days)

    Labels starting with neither 'L' nor 'R' map to np.nan.
    """
    label = str(timepoint).strip().upper()

    # Pre-launch: strip the letter and any sign, negate the day number.
    if label.startswith("L"):
        digits = label.replace("L", "").replace("+", "").replace("-", "")
        return -int(digits or "0")

    # Return / post-flight: offset by the stretched 30-day mission length.
    if label.startswith("R"):
        digits = label.replace("R", "").replace("+", "").replace("-", "")
        return int(digits or "0") + 30

    return np.nan
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def add_flight_day(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of *df* with a numeric 'flight_day' column derived from
    'timepoint'. Drops the redundant 'Sample Name' column when present.

    Raises:
        ValueError: if *df* has no 'timepoint' column.
    """
    out = df.copy()
    if "timepoint" not in out.columns:
        raise ValueError("DataFrame must contain a 'timepoint' column")

    # Map each label (e.g. 'L-3', 'R+1') onto the stretched numeric scale.
    out["flight_day"] = out["timepoint"].map(parse_timepoint)

    # 'Sample Name' is redundant with astronaut/timepoint info; discard it
    # when it exists (errors="ignore" makes the drop a no-op otherwise).
    out = out.drop(columns=["Sample Name"], errors="ignore")

    return out
|
| 46 |
-
|
| 47 |
-
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of *df* with the derived Anion Gap analyte added.

    Anion Gap = Sodium − Chloride − Carbon Dioxide.
    The derived columns are added only when all three inputs are present.
    """
    out = df.copy()

    required = ("sodium_value", "chloride_value", "carbon_dioxide_value")
    if not all(col in out.columns for col in required):
        return out

    sodium, chloride, co2 = (out[col].astype(float) for col in required)
    out["anion_gap_value"] = sodium - chloride - co2

    # Placeholders only: the reference range for the derived analyte is
    # defined manually in stats.ANALYTE_INFO, not per-row.
    out["anion_gap_range_min"] = np.nan
    out["anion_gap_range_max"] = np.nan

    return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/graphMaking.py
DELETED
|
@@ -1,256 +0,0 @@
|
|
| 1 |
-
import plotly.graph_objects as go
|
| 2 |
-
import plotly.express as px
|
| 3 |
-
import numpy as np
|
| 4 |
-
import pandas as pd
|
| 5 |
-
|
| 6 |
-
def make_figure(
    tidy_df: pd.DataFrame,
    stats_df: pd.DataFrame,
    analytes: list,
    astronaut_filter=None,
    show_error: str | None = None
):
    """
    Build an interactive Plotly figure of analyte values over mission days,
    with optional statistical overlays.

    Parameters
    ----------
    tidy_df:
        Long-format data with columns astronautID, analyte, value,
        flight_day, timepoint, min, max, and optionally unit / sex
        (presumably the output of stats.tidy_from_wide — TODO confirm).
    stats_df:
        Per-analyte statistics with a 'test_type' column ("within"/"group"),
        as produced by stats.analyze_r1_vs_L.
    analytes:
        Analyte base names to plot; must be non-empty (analytes[0] is
        indexed when building the title).
    astronaut_filter:
        None for all participants, "Male"/"Female" to filter on the 'sex'
        column, or a collection of astronautID values.
    show_error:
        None, "within" (per-astronaut mean(L) ± SE bands), or "group"
        (group-level band derived from effect size).

    Returns
    -------
    plotly.graph_objects.Figure
    """

    fig = go.Figure()

    # Highlight stretched space interval (0 to 30 days)
    fig.add_vrect(x0=0, x1=30, fillcolor="LightGray", opacity=0.3,
                  layer="below", line_width=0)
    # Dotted separators inside the stretched in-flight interval.
    for day in [10, 20]:
        fig.add_vline(x=day, line=dict(color="white", width=2, dash="dot"),
                      layer="below")

    df = tidy_df.copy()

    # Apply participant filter
    if astronaut_filter is None:
        pass  # show all
    elif isinstance(astronaut_filter, str) and astronaut_filter in ["Male", "Female"]:
        # Sex filter only applies when the column exists; otherwise all rows kept.
        if "sex" in df.columns:
            df = df[df["sex"] == astronaut_filter]
    elif isinstance(astronaut_filter, (list, tuple, set)):
        df = df[df["astronautID"].isin(astronaut_filter)]

    # Loop analytes requested
    for analyte in analytes:
        subdf = df[df["analyte"] == analyte]
        if subdf.empty:
            print(f"[make_figure] Skipping {analyte} – no data")
            continue

        ## Y-axis scaling: healthy-range bounds vs. observed data bounds.
        ref_min = subdf["min"].dropna().min()
        ref_max = subdf["max"].dropna().max()
        data_min = subdf["value"].min()
        data_max = subdf["value"].max()

        # Axis label includes the unit when one is available for this analyte.
        if "unit" in subdf.columns and not subdf["unit"].dropna().empty:
            unit = subdf["unit"].dropna().iloc[0]
            y_label = f"{analyte.title()} ({unit})"
        else:
            y_label = analyte.title()

        ## Add healthy range lines from min / max
        if pd.notna(ref_min):
            fig.add_hline(
                y=ref_min,
                line=dict(color="green", width=2, dash="dot"),
                annotation_text="Min",
                annotation_position="bottom right"
            )
        if pd.notna(ref_max):
            fig.add_hline(
                y=ref_max,
                line=dict(color="green", width=2, dash="dot"),
                annotation_text="Max",
                annotation_position="top right"
            )

        ## Decide axis limits: must include BOTH healthy range and all data
        low_candidates = [v for v in [ref_min, data_min] if pd.notna(v)]
        high_candidates = [v for v in [ref_max, data_max] if pd.notna(v)]

        if low_candidates and high_candidates:
            low = min(low_candidates)
            high = max(high_candidates)
            span = high - low if high > low else 1  # avoid zero/negative span
            padding = 0.1 * span
            y_range = [low - padding, high + padding]
        else:
            y_range = None

        ## Apply axis update once
        # NOTE(review): with multiple analytes, each loop iteration overwrites
        # the single y-axis title/range — the last analyte wins. Confirm this
        # is intended (e.g. analytes sharing one scale).
        if y_range:
            fig.update_yaxes(title=y_label, range=y_range)
        else:
            fig.update_yaxes(title=y_label)

        ## Plot each astronaut trace - first assign stable per-astronaut colors
        palette = px.colors.qualitative.Set2
        astronaut_colors = {astr: palette[i % len(palette)]
                            for i, astr in enumerate(subdf["astronautID"].unique())}

        ## Plot each astronaut trace
        for astronaut, adf in subdf.groupby("astronautID"):
            if adf.empty:
                continue
            adf = adf.sort_values("flight_day")
            base_color = astronaut_colors[astronaut]

            ### Skip if astronaut not in filter
            if isinstance(astronaut_filter, (list, tuple, set)) and astronaut not in astronaut_filter:
                continue

            # Main Scatter Plot: one lines+markers trace per astronaut/analyte
            fig.add_trace(go.Scatter(
                x=adf["flight_day"],
                y=adf["value"],
                mode="lines+markers",
                name=f"{astronaut} ({analyte})",
                hovertext=adf["timepoint"],
                hovertemplate="Day %{hovertext}<br>Value %{y}<extra></extra>",
                line=dict(color=base_color),
                marker=dict(color=base_color)
            ))

        ## Within-astronaut error band (mean of pre-launch samples ± SE)
        if show_error == "within" and not stats_df.empty:
            stat_rows = stats_df[
                (stats_df["analyte"] == analyte)
                & (stats_df["test_type"] == "within")
            ]

            # NOTE(review): this loop rebinds `astronaut` from the trace loop
            # above — harmless here, but fragile if code is reordered.
            for _, row in stat_rows.iterrows():
                astronaut = row["astronautID"]
                if astronaut not in subdf["astronautID"].unique():
                    continue  # skip astronauts not in this analyte subset

                mean_L = row.get("mean_L", np.nan)
                se = row.get("se_L", np.nan)
                R1 = row.get("R1", np.nan)

                if pd.isna(mean_L) or pd.isna(se):
                    continue

                # Translucent fill matching the astronaut's trace color.
                base_color = astronaut_colors.get(astronaut, "gray")
                if base_color.startswith("rgb"):
                    fill_color = base_color.replace("rgb", "rgba").replace(")", ",0.15)")
                else:
                    fill_color = base_color

                #### Horizontal band: L +/- SE
                fig.add_hrect(
                    y0=mean_L - se, y1=mean_L + se,
                    fillcolor=fill_color,
                    opacity=0.2,
                    line_width=0,
                    layer="below"
                )

                #### Asterisk if R+1 outside band
                if pd.notna(R1) and (R1 < mean_L - se or R1 > mean_L + se):
                    fig.add_annotation(
                        x=31,  # R+1 sits at flight day 31 on the stretched scale
                        y=R1,
                        text="*",
                        showarrow=False,
                        font=dict(size=20, color="red"),
                        yshift=15
                    )

        ## Group-level error band
        if show_error == "group" and not stats_df.empty:
            stat_rows = stats_df[
                (stats_df["analyte"] == analyte)
                & (stats_df["test_type"] == "group")
            ]

            for _, row in stat_rows.iterrows():
                mean_L = row.get("mean_L", np.nan)
                n = row.get("n_L", 0)

                # Back out the raw deviation from Cohen's d:
                # |R1 - mean_L| / |d| ≈ the pooled SD used for the band.
                error = np.nan
                if pd.notna(row.get("effect_size")) and n > 1 and row["effect_size"] != 0:
                    error = abs(row.get("R1", np.nan) - mean_L) / abs(row["effect_size"])
                if pd.isna(error):
                    error = 0

                #### Filter bands only if stats_df has group info
                should_plot = True
                if "group" in row.index and astronaut_filter is not None:
                    group_id = row["group"]

                    if isinstance(astronaut_filter, str) and astronaut_filter in ["Male", "Female"]:
                        should_plot = (group_id == astronaut_filter)
                    elif isinstance(astronaut_filter, (list, tuple, set)):
                        # Only show if group_id matches one of the selected astronauts
                        should_plot = (group_id in astronaut_filter)

                if should_plot and pd.notna(mean_L):
                    fig.add_hrect(
                        y0=mean_L - error, y1=mean_L + error,
                        fillcolor="gray", opacity=0.2,
                        layer="below", line_width=0,
                        annotation_text="Group Error Band",
                        annotation_position="top left"
                    )

                    # Significance marker at R+1 when the paired test rejects H0.
                    if row.get("p_value") is not None and row["p_value"] < 0.05:
                        fig.add_annotation(
                            x=31,  # R+1 = 31
                            y=row.get("R1", mean_L),
                            text="*",
                            showarrow=False,
                            font=dict(size=20, color="red"),
                            yshift=15
                        )

        ## Only update range if ref_min/ref_max are valid
        # NOTE(review): this second update overrides the padded y_range chosen
        # above with a multiplicative 0.9/1.1 window around the healthy range,
        # which may clip data points outside that range — confirm intended.
        if pd.notna(ref_min) and pd.notna(ref_max):
            fig.update_yaxes(title=y_label,
                             range=[ref_min * 0.9, ref_max * 1.1])
        else:
            fig.update_yaxes(title=y_label)

    # Layout: Build Dynamic Title
    if astronaut_filter is None:
        group_label = "All Participants"
    elif isinstance(astronaut_filter, str) and astronaut_filter in ["Male", "Female"]:
        group_label = f"{astronaut_filter} Participants"
    elif isinstance(astronaut_filter, (list, tuple, set)):
        group_label = "Subset: " + ", ".join(astronaut_filter)
    else:
        group_label = "Participants"

    # Build analyte label with units if available (unit taken from the FIRST
    # requested analyte only).
    ana_label = ", ".join(analytes)
    unit_label = ""
    subdf = df[df["analyte"] == analytes[0]]
    if "unit" in subdf.columns and not subdf["unit"].dropna().empty:
        unit_label = f" ({subdf['unit'].dropna().iloc[0]})"

    fig.update_layout(
        title=f"{ana_label.title()}{unit_label} Trends ({group_label})",
        xaxis_title="Mission Day",
        legend_title="Participant / Analyte",
        hovermode="x unified",
        template="plotly_white",
        margin=dict(l=60, r=30, t=60, b=60)
    )

    # Custom ticks: relabel stretched numeric days back to L…/R+… notation.
    ticks = [t for t in sorted(df["flight_day"].dropna().unique()) if pd.notna(t)]
    ticktext = []
    for t in ticks:
        if t >= 30:
            lbl = f"R+{int(t-30)}"
        else:
            lbl = f"L{int(t)}"
        ticktext.append(lbl)
    if ticks:
        fig.update_xaxes(tickmode="array", tickvals=ticks, ticktext=ticktext)

    return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/stats.py
DELETED
|
@@ -1,205 +0,0 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
import numpy as np
|
| 3 |
-
from scipy import stats
|
| 4 |
-
from .featureEngineering import parse_timepoint
|
| 5 |
-
|
| 6 |
-
# Map analyte base names to human labels + units + reference ranges
|
| 7 |
-
## To get sub and superscripts in Markdown I used ChatGPT: https://chatgpt.com/share/68d9c8f6-2674-8008-8ff7-0731bec9ad49
|
| 8 |
-
# Keys are analyte base names as they appear (prefixed) in the wide CSV
# columns; values give the display label and unit. Only derived analytes
# (anion_gap) carry manual "min"/"max" reference bounds — all others take
# per-row ranges from the CSV.
ANALYTE_INFO = {
    # Blood Chemistry panel
    "albumin": {"label": "Albumin", "unit": "g/dL"},
    "alkaline_phosphatase": {"label": "Alkaline Phosphatase", "unit": "U/L"},
    "alt": {"label": "ALT", "unit": "U/L"},
    "ast": {"label": "AST", "unit": "U/L"},
    "total_bilirubin": {"label": "Bilirubin", "unit": "mg/dL"},
    "bun_to_creatinine_ratio": {"label": "BUN/Creatinine Ratio", "unit": ""},
    "calcium": {"label": "Ca²⁺", "unit": "mg/dL"},
    "carbon_dioxide": {"label": "CO₂", "unit": "mmol/L"},
    "chloride": {"label": "Cl⁻", "unit": "mmol/L"},
    "creatinine": {"label": "Creatinine", "unit": "mg/dL"},
    "egfr_african_american": {"label": "eGFR (AA)", "unit": "mL/min/1.73m²"},
    "egfr_non_african_american": {"label": "eGFR (non-AA)", "unit": "mL/min/1.73m²"},
    "globulin": {"label": "Globulin", "unit": "g/dL"},
    "glucose": {"label": "Glucose", "unit": "mg/dL"},
    "potassium": {"label": "K⁺", "unit": "mmol/L"},
    "total_protein": {"label": "Protein", "unit": "g/dL"},
    "sodium": {"label": "Na⁺", "unit": "mmol/L"},
    "urea_nitrogen_bun": {"label": "BUN", "unit": "mg/dL"},

    # Derived feature (computed in featureEngineering.add_derived_features);
    # manual reference range because the CSV has no per-row min/max for it.
    "anion_gap": {
        "label": "Anion Gap",
        "unit": "mmol/L",
        "min": 8,  # manual reference range
        "max": 24
    },

    ## Cardiovascular panel
    "a2_macroglobulin": {"label": "α₂-Macroglobulin", "unit": "ng/mL"},
    "agp": {"label": "AGP (α1-acid glycoprotein)", "unit": "ng/mL"},
    "crp": {"label": "CRP (C-reactive protein)", "unit": "pg/mL"},
    "fetuin_a36": {"label": "Fetuin A3/6", "unit": "ng/mL"},
    "fibrinogen": {"label": "Fibrinogen", "unit": "ng/mL"},
    "haptoglobin": {"label": "Haptoglobin", "unit": "ng/mL"},
    "l_selectin": {"label": "L-Selectin", "unit": "pg/mL"},
    "pf4": {"label": "Platelet Factor 4", "unit": "ng/mL"},
    "sap": {"label": "SAP (Serum Amyloid P)", "unit": "pg/mL"},
}
|
| 49 |
-
|
| 50 |
-
# Helpers to find columns by prefix (robust to unit suffixes)
|
| 51 |
-
def _first_col_startswith(df: pd.DataFrame, prefixes) -> str | None:
|
| 52 |
-
"""
|
| 53 |
-
Return the first column whose lowercase name starts with any prefix in `prefixes`.
|
| 54 |
-
"""
|
| 55 |
-
if isinstance(prefixes, str):
|
| 56 |
-
prefixes = [prefixes]
|
| 57 |
-
prefixes = [p.lower() for p in prefixes]
|
| 58 |
-
for col in df.columns:
|
| 59 |
-
cl = col.lower()
|
| 60 |
-
for p in prefixes:
|
| 61 |
-
if cl.startswith(p):
|
| 62 |
-
return col
|
| 63 |
-
return None
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
def _value_min_max_cols(df: pd.DataFrame, analyte: str):
    """
    Return (value_col, min_col, max_col) for the given base analyte name.

    Columns are matched by prefix so trailing unit suffixes don't matter.
    Clinical chemistry uses `<analyte>_value`; cardiovascular panels fall
    back to `<analyte>_concentration`. Any element may be None if no
    matching column exists.
    """
    value_col = _first_col_startswith(df, f"{analyte}_value")
    if value_col is None:
        value_col = _first_col_startswith(df, f"{analyte}_concentration")

    min_col = _first_col_startswith(df, [f"{analyte}_range_min", f"{analyte}_min"])
    max_col = _first_col_startswith(df, [f"{analyte}_range_max", f"{analyte}_max"])

    return value_col, min_col, max_col
|
| 79 |
-
|
| 80 |
-
# Tidy Transformation
|
| 81 |
-
def tidy_from_wide(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the wide astronaut CSV (value/min/max column triplets per
    analyte) into tidy long format.

    Returns a DataFrame with columns
    [astronautID, timepoint, flight_day, analyte, value, min, max, unit,
    label, sex].

    Notes
    -----
    - Per-row min/max from the CSV take priority; when absent (or NaN) the
      manual "min"/"max" from ANALYTE_INFO are used as fallback.
    - NOTE(review): "anion_gap" is explicitly skipped below, so the derived
      analyte never appears in the tidy output even though
      add_derived_features creates its columns — confirm whether the skip
      is intentional.
    - NOTE(review): sex is hard-coded from astronaut IDs C001/C004 = Male;
      verify against the study roster.
    """
    tidy_records = []

    # normalize lookup for id/timepoint columns (case-insensitive match)
    colmap = {c.lower(): c for c in df.columns}
    astronaut_col = colmap.get("astronautid")
    timepoint_col = colmap.get("timepoint")

    if astronaut_col is None or timepoint_col is None:
        raise KeyError("Expected astronautID and timepoint columns in input CSV")

    for analyte, meta in ANALYTE_INFO.items():
        if analyte == "anion_gap":
            continue

        # Resolve this analyte's value/min/max columns by prefix; skip
        # analytes that have no value column in this CSV.
        value_col, min_col, max_col = _value_min_max_cols(df, analyte)
        if value_col is None:
            continue

        # One tidy record per (row, analyte) pair.
        for _, row in df.iterrows():
            rec = {
                "astronautID": row[astronaut_col],
                "timepoint": row[timepoint_col],
                "flight_day": parse_timepoint(row[timepoint_col]),
                "analyte": analyte,
                "value": row[value_col],
                # CSV range wins; ANALYTE_INFO supplies the fallback (or None).
                "min": (row[min_col] if (min_col and pd.notna(row[min_col])) else meta.get("min")),
                "max": (row[max_col] if (max_col and pd.notna(row[max_col])) else meta.get("max")),
                "label": meta["label"],
                "unit": meta["unit"],
                "sex": "Male" if str(row[astronaut_col]) in ["C001", "C004"] else "Female",
            }
            tidy_records.append(rec)

    return pd.DataFrame(tidy_records)
|
| 121 |
-
|
| 122 |
-
# Statistical Comparison: R+1 vs L-series
|
| 123 |
-
def analyze_r1_vs_L(tidy: pd.DataFrame) -> pd.DataFrame:
    """
    Compare the first recovery sample (R+1) against the pre-launch (L-series)
    baseline for each analyte.

    Two test levels per analyte:
    - Within-astronaut: one-sample t-test (H0: mean(L) == R+1), computed
      manually from the L-sample mean/SD. Emits per-astronaut rows with
      test_type == "within" (mean, std, SE, t-stat, p-value, Cohen's d).
    - Across-astronauts: paired t-test (scipy.stats.ttest_rel) on each
      astronaut's mean(L) vs R+1. Emits one row per analyte with
      test_type == "group" and astronautID == "ALL".

    An astronaut contributes only when it has >= 2 pre-launch values and
    exactly one R+1 value; the group test needs >= 2 such astronauts.

    Returns an empty DataFrame when no analyte meets those conditions.
    """
    results = []
    for analyte, subdf in tidy.groupby("analyte"):

        ## Within-astronaut tests
        for astronaut, adf in subdf.groupby("astronautID"):
            # Pre-launch rows vs. the single first-recovery row; several R+1
            # spellings are tolerated.
            L_mask = adf["timepoint"].astype(str).str.startswith("L")
            R1_mask = adf["timepoint"].astype(str).isin(["R+1", "R1", "R+01"])

            L_vals = adf.loc[L_mask, "value"].dropna().astype(float)
            R1_vals = adf.loc[R1_mask, "value"].dropna().astype(float)

            if len(L_vals) >= 2 and len(R1_vals) == 1:
                R1 = float(R1_vals.iloc[0])
                mean_L = float(L_vals.mean())
                std_L = float(L_vals.std(ddof=1))  # sample SD
                n_L = int(L_vals.shape[0])

                if std_L > 0:
                    # Manual one-sample t-test: two-sided p from the t CDF
                    # with n_L - 1 degrees of freedom.
                    se = std_L / np.sqrt(n_L)
                    t_stat = (mean_L - R1) / se
                    p_val = 2 * (1 - stats.t.cdf(abs(t_stat), df=n_L - 1))
                    cohen_d = (R1 - mean_L) / std_L
                else:
                    # Zero baseline variance: test undefined, record NaNs.
                    se = t_stat = p_val = cohen_d = np.nan

                results.append({
                    "analyte": analyte,
                    "astronautID": astronaut,
                    "test_type": "within",
                    "n_L": n_L,
                    "mean_L": round(mean_L, 2),
                    "R1": round(R1, 2),
                    "std_L": round(std_L, 2),
                    "se_L": round(se, 2) if pd.notna(se) else np.nan,
                    "t_stat": round(t_stat, 3) if pd.notna(t_stat) else np.nan,
                    "p_value": round(p_val, 4) if pd.notna(p_val) else np.nan,
                    "effect_size": round(cohen_d, 3) if pd.notna(cohen_d) else np.nan,
                })

        ## Across-astronauts (paired test)
        # Re-derive each astronaut's mean(L) and R+1 under the same
        # eligibility rule as above.
        astronaut_means, astronaut_R1 = [], []
        for astronaut, adf in subdf.groupby("astronautID"):
            L_mask = adf["timepoint"].astype(str).str.startswith("L")
            R1_mask = adf["timepoint"].astype(str).isin(["R+1", "R1", "R+01"])

            L_vals = adf.loc[L_mask, "value"].dropna().astype(float)
            R1_vals = adf.loc[R1_mask, "value"].dropna().astype(float)

            if len(L_vals) >= 2 and len(R1_vals) == 1:
                astronaut_means.append(float(L_vals.mean()))
                astronaut_R1.append(float(R1_vals.iloc[0]))

        if len(astronaut_means) >= 2:
            diffs = np.array(astronaut_R1) - np.array(astronaut_means)
            t_stat, p_val = stats.ttest_rel(astronaut_R1, astronaut_means)

            # Group-level variability of the baselines.
            # NOTE(review): se_L is computed but not stored in the result
            # record below — confirm whether it should be included.
            std_L = np.std(astronaut_means, ddof=1)
            se_L = std_L / np.sqrt(len(astronaut_means))

            # Cohen's d for paired data: mean difference / SD of differences.
            cohen_d = diffs.mean() / diffs.std(ddof=1) if diffs.std(ddof=1) > 0 else np.nan

            results.append({
                "analyte": analyte,
                "astronautID": "ALL",
                "test_type": "group",
                "n_L": len(astronaut_means),
                "mean_L": round(float(np.mean(astronaut_means)), 2),
                "R1": round(float(np.mean(astronaut_R1)), 2),
                "t_stat": round(float(t_stat), 3),
                "p_value": round(float(p_val), 4),
                "effect_size": round(float(cohen_d), 3) if pd.notna(cohen_d) else np.nan,
            })

    return pd.DataFrame(results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|