DTanzillo committed on
Commit
77a7dbf
·
verified ·
1 Parent(s): e55c60e

Delete scripts

Browse files
scripts/featureEngineering.py DELETED
@@ -1,63 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
-
4
def parse_timepoint(timepoint: str) -> float:
    """
    Convert timepoint strings like 'L-3', 'L0', 'R+0', 'R+1' into numeric flight days
    on a stretched scale.

    The 3 days of flight are stretched to 30 days so the pre-flight, in-flight and
    recovery samples are visibly separated on a chart; the days strictly between
    0 and 30 therefore do not correspond to real samples.

    Convention:
        L-0 -> 0    (launch day = Flight Day 0)
        L-3 -> -3   (3 days before launch)
        R+0 -> 30   (last day in space, stretched to day 30)
        R+1 -> 31   (first recovery day)
        R+N -> N+30 (general rule for return / recovery days)

    Parsing is case-insensitive and ignores surrounding whitespace. The '+'/'-'
    characters are discarded: the direction comes from the L/R prefix alone.

    Returns:
        An int flight day for a recognised label, or np.nan when the label has
        neither an L nor an R prefix, or when its numeric part cannot be parsed.
    """
    label = str(timepoint).strip().upper()

    if label.startswith("L"):  # Pre-launch: counted backwards from day 0
        prefix, sign, offset = "L", -1, 0
    elif label.startswith("R"):  # Return / post-flight: counted forwards from day 30
        prefix, sign, offset = "R", 1, 30
    else:
        return np.nan

    digits = label.replace(prefix, "").replace("+", "").replace("-", "") or "0"
    try:
        number = int(digits)
    except ValueError:
        # Malformed label (e.g. "Launch"): treat like any other unrecognised
        # timepoint instead of aborting a whole DataFrame.apply().
        return np.nan

    return sign * number + offset
27
-
28
-
29
def add_flight_day(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` with a numeric 'flight_day' column derived from 'timepoint'.

    Each 'timepoint' value is converted with parse_timepoint. The redundant
    'Sample Name' column is removed from the result when present. The input
    frame itself is left untouched.

    Raises:
        ValueError: if `df` has no 'timepoint' column.
    """
    if "timepoint" not in df.columns:
        raise ValueError("DataFrame must contain a 'timepoint' column")

    result = df.copy()

    # Numeric (stretched) mission-day scale
    result["flight_day"] = result["timepoint"].apply(parse_timepoint)

    # 'Sample Name' duplicates information already carried by other columns
    has_sample_name = "Sample Name" in result.columns
    return result.drop(columns=["Sample Name"]) if has_sample_name else result
46
-
47
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` extended with the derived Anion Gap feature.

    Anion Gap = Sodium − Chloride − Carbon Dioxide

    The feature is only added when all three source columns ('sodium_value',
    'chloride_value', 'carbon_dioxide_value') are present; otherwise the copy
    is returned unchanged.
    """
    out = df.copy()

    sources = ("sodium_value", "chloride_value", "carbon_dioxide_value")
    if any(col not in out.columns for col in sources):
        return out

    sodium, chloride, carbon_dioxide = (out[col].astype(float) for col in sources)
    out["anion_gap_value"] = sodium - chloride - carbon_dioxide

    # Placeholders; min/max defined manually in stats.ANALYTE_INFO
    for bound in ("anion_gap_range_min", "anion_gap_range_max"):
        out[bound] = np.nan

    return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/graphMaking.py DELETED
@@ -1,256 +0,0 @@
1
- import plotly.graph_objects as go
2
- import plotly.express as px
3
- import numpy as np
4
- import pandas as pd
5
-
6
def make_figure(
    tidy_df: pd.DataFrame,
    stats_df: pd.DataFrame,
    analytes: list,
    astronaut_filter=None,
    show_error: str = None
):
    """
    Build an interactive mission-day plot with optional stats overlays.

    Parameters:
        tidy_df: long-format data. The columns read here are 'analyte',
            'astronautID', 'flight_day', 'timepoint', 'value', 'min', 'max',
            and optionally 'unit' and 'sex'.
        stats_df: statistics table. The columns read here are 'analyte',
            'test_type', 'astronautID', 'mean_L', 'se_L', 'R1', 'n_L',
            'effect_size', 'p_value', and optionally 'group'.
        analytes: analyte names to draw; analytes without data are skipped.
        astronaut_filter: None (everyone), "Male"/"Female" (filter on 'sex'
            when that column exists), or a list/tuple/set of astronaut IDs.
        show_error: "within" draws one mean_L ± se_L band per astronaut,
            "group" draws the group-level band; any other value draws none.

    Returns:
        The populated plotly Figure.

    All analytes share one y-axis. Its range covers the reference ranges and
    the data of every plotted analyte (plus 10 % padding), and its title is the
    label of the last plotted analyte.
    """
    fig = go.Figure()

    # Highlight stretched space interval (0 to 30 days)
    fig.add_vrect(x0=0, x1=30, fillcolor="LightGray", opacity=0.3,
                  layer="below", line_width=0)
    for day in (10, 20):
        fig.add_vline(x=day, line=dict(color="white", width=2, dash="dot"),
                      layer="below")

    filter_is_sex = isinstance(astronaut_filter, str) and astronaut_filter in ("Male", "Female")
    filter_is_ids = isinstance(astronaut_filter, (list, tuple, set))

    # Apply participant filter (None, or any unrecognised filter, shows everyone)
    df = tidy_df.copy()
    if filter_is_sex:
        if "sex" in df.columns:
            df = df[df["sex"] == astronaut_filter]
    elif filter_is_ids:
        df = df[df["astronautID"].isin(astronaut_filter)]

    have_stats = stats_df is not None and not stats_df.empty
    palette = px.colors.qualitative.Set2

    # Axis limits are accumulated over every plotted analyte so that a later
    # analyte (or the reference range alone) can never clip earlier data.
    axis_lows, axis_highs = [], []
    y_label = None

    for analyte in analytes:
        subdf = df[df["analyte"] == analyte]
        if subdf.empty:
            print(f"[make_figure] Skipping {analyte} – no data")
            continue

        ## Reference (healthy) range and data extent
        ref_min = subdf["min"].dropna().min()
        ref_max = subdf["max"].dropna().max()
        axis_lows.extend(v for v in (ref_min, subdf["value"].min()) if pd.notna(v))
        axis_highs.extend(v for v in (ref_max, subdf["value"].max()) if pd.notna(v))

        if "unit" in subdf.columns and not subdf["unit"].dropna().empty:
            unit = subdf["unit"].dropna().iloc[0]
            y_label = f"{analyte.title()} ({unit})"
        else:
            y_label = analyte.title()

        ## Healthy range lines from min / max
        if pd.notna(ref_min):
            fig.add_hline(
                y=ref_min,
                line=dict(color="green", width=2, dash="dot"),
                annotation_text="Min",
                annotation_position="bottom right"
            )
        if pd.notna(ref_max):
            fig.add_hline(
                y=ref_max,
                line=dict(color="green", width=2, dash="dot"),
                annotation_text="Max",
                annotation_position="top right"
            )

        ## One stable colour per astronaut within this analyte
        astronaut_colors = {
            astr: palette[i % len(palette)]
            for i, astr in enumerate(subdf["astronautID"].unique())
        }

        ## One trace per astronaut (df is already restricted to the filter)
        for astronaut, adf in subdf.groupby("astronautID"):
            adf = adf.sort_values("flight_day")
            base_color = astronaut_colors[astronaut]
            fig.add_trace(go.Scatter(
                x=adf["flight_day"],
                y=adf["value"],
                mode="lines+markers",
                name=f"{astronaut} ({analyte})",
                hovertext=adf["timepoint"],
                hovertemplate="Day %{hovertext}<br>Value %{y}<extra></extra>",
                line=dict(color=base_color),
                marker=dict(color=base_color)
            ))

        ## Within-astronaut error bands, drawn once per analyte
        if show_error == "within" and have_stats:
            stat_rows = stats_df[
                (stats_df["analyte"] == analyte)
                & (stats_df["test_type"] == "within")
            ]

            for _, row in stat_rows.iterrows():
                astronaut = row["astronautID"]
                if astronaut not in astronaut_colors:
                    continue  # skip astronauts not in this analyte subset

                mean_L = row.get("mean_L", np.nan)
                se = row.get("se_L", np.nan)
                R1 = row.get("R1", np.nan)

                if pd.isna(mean_L) or pd.isna(se):
                    continue

                base_color = astronaut_colors[astronaut]
                if base_color.startswith("rgb"):
                    # "rgb(r,g,b)" -> "rgba(r,g,b,0.15)" for a translucent fill
                    fill_color = base_color.replace("rgb", "rgba").replace(")", ",0.15)")
                else:
                    fill_color = base_color

                #### Horizontal band: L +/- SE
                fig.add_hrect(
                    y0=mean_L - se, y1=mean_L + se,
                    fillcolor=fill_color,
                    opacity=0.2,
                    line_width=0,
                    layer="below"
                )

                #### Asterisk if R+1 outside band
                if pd.notna(R1) and (R1 < mean_L - se or R1 > mean_L + se):
                    fig.add_annotation(
                        x=31,  # R+1 = 31
                        y=R1,
                        text="*",
                        showarrow=False,
                        font=dict(size=20, color="red"),
                        yshift=15
                    )

        ## Group-level error band
        if show_error == "group" and have_stats:
            stat_rows = stats_df[
                (stats_df["analyte"] == analyte)
                & (stats_df["test_type"] == "group")
            ]

            for _, row in stat_rows.iterrows():
                mean_L = row.get("mean_L", np.nan)
                n = row.get("n_L", 0)

                # Band half-width recovered from the effect size:
                # |R1 - mean_L| / |effect_size|; falls back to 0 when undefined.
                error = np.nan
                if pd.notna(row.get("effect_size")) and n > 1 and row["effect_size"] != 0:
                    error = abs(row.get("R1", np.nan) - mean_L) / abs(row["effect_size"])
                if pd.isna(error):
                    error = 0

                #### Filter bands only if stats_df has group info
                should_plot = True
                if "group" in row.index and astronaut_filter is not None:
                    group_id = row["group"]
                    if filter_is_sex:
                        should_plot = (group_id == astronaut_filter)
                    elif filter_is_ids:
                        # Only show if group_id matches one of the selected astronauts
                        should_plot = (group_id in astronaut_filter)

                if not should_plot:
                    continue

                if pd.notna(mean_L):
                    fig.add_hrect(
                        y0=mean_L - error, y1=mean_L + error,
                        fillcolor="gray", opacity=0.2,
                        layer="below", line_width=0,
                        annotation_text="Group Error Band",
                        annotation_position="top left"
                    )

                if row.get("p_value") is not None and row["p_value"] < 0.05:
                    fig.add_annotation(
                        x=31,  # R+1 = 31
                        y=row.get("R1", mean_L),
                        text="*",
                        showarrow=False,
                        font=dict(size=20, color="red"),
                        yshift=15
                    )

    # Apply the y-axis update once: limits include BOTH healthy range and all data
    if y_label is not None:
        if axis_lows and axis_highs:
            low, high = min(axis_lows), max(axis_highs)
            span = high - low if high > low else 1
            padding = 0.1 * span
            fig.update_yaxes(title=y_label, range=[low - padding, high + padding])
        else:
            fig.update_yaxes(title=y_label)

    # Layout: Build Dynamic Title
    if astronaut_filter is None:
        group_label = "All Participants"
    elif filter_is_sex:
        group_label = f"{astronaut_filter} Participants"
    elif filter_is_ids:
        group_label = "Subset: " + ", ".join(map(str, astronaut_filter))
    else:
        group_label = "Participants"

    # Build analyte label with units (of the first analyte) if available
    ana_label = ", ".join(analytes)
    unit_label = ""
    if analytes:
        first = df[df["analyte"] == analytes[0]]
        if "unit" in first.columns and not first["unit"].dropna().empty:
            unit_label = f" ({first['unit'].dropna().iloc[0]})"

    fig.update_layout(
        title=f"{ana_label.title()}{unit_label} Trends ({group_label})",
        xaxis_title="Mission Day",
        legend_title="Participant / Analyte",
        hovermode="x unified",
        template="plotly_white",
        margin=dict(l=60, r=30, t=60, b=60)
    )

    # Custom ticks: show the original timepoint labels instead of stretched days
    ticks = sorted(df["flight_day"].dropna().unique())
    if ticks:
        ticktext = [
            f"R+{int(t - 30)}" if t >= 30 else f"L{int(t)}"
            for t in ticks
        ]
        fig.update_xaxes(tickmode="array", tickvals=ticks, ticktext=ticktext)

    return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/stats.py DELETED
@@ -1,205 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- from scipy import stats
4
- from .featureEngineering import parse_timepoint
5
-
6
- # Map analyte base names to human labels + units + reference ranges
7
- ## To get sub and superscripts in Markdown I used ChatGPT: https://chatgpt.com/share/68d9c8f6-2674-8008-8ff7-0731bec9ad49
8
# Per-analyte display metadata, keyed by the analyte base name used in column
# prefixes. Every entry has a "label" and a "unit"; only the derived anion gap
# also carries a manual reference range ("min"/"max").
ANALYTE_INFO = {
    # Blood chemistry
    "albumin": dict(label="Albumin", unit="g/dL"),
    "alkaline_phosphatase": dict(label="Alkaline Phosphatase", unit="U/L"),
    "alt": dict(label="ALT", unit="U/L"),
    "ast": dict(label="AST", unit="U/L"),
    "total_bilirubin": dict(label="Bilirubin", unit="mg/dL"),
    "bun_to_creatinine_ratio": dict(label="BUN/Creatinine Ratio", unit=""),
    "calcium": dict(label="Ca²⁺", unit="mg/dL"),
    "carbon_dioxide": dict(label="CO₂", unit="mmol/L"),
    "chloride": dict(label="Cl⁻", unit="mmol/L"),
    "creatinine": dict(label="Creatinine", unit="mg/dL"),
    "egfr_african_american": dict(label="eGFR (AA)", unit="mL/min/1.73m²"),
    "egfr_non_african_american": dict(label="eGFR (non-AA)", unit="mL/min/1.73m²"),
    "globulin": dict(label="Globulin", unit="g/dL"),
    "glucose": dict(label="Glucose", unit="mg/dL"),
    "potassium": dict(label="K⁺", unit="mmol/L"),
    "total_protein": dict(label="Protein", unit="g/dL"),
    "sodium": dict(label="Na⁺", unit="mmol/L"),
    "urea_nitrogen_bun": dict(label="BUN", unit="mg/dL"),

    # Derived feature (manual reference range)
    "anion_gap": dict(label="Anion Gap", unit="mmol/L", min=8, max=24),

    # Cardiovascular
    "a2_macroglobulin": dict(label="α₂-Macroglobulin", unit="ng/mL"),
    "agp": dict(label="AGP (α1-acid glycoprotein)", unit="ng/mL"),
    "crp": dict(label="CRP (C-reactive protein)", unit="pg/mL"),
    "fetuin_a36": dict(label="Fetuin A3/6", unit="ng/mL"),
    "fibrinogen": dict(label="Fibrinogen", unit="ng/mL"),
    "haptoglobin": dict(label="Haptoglobin", unit="ng/mL"),
    "l_selectin": dict(label="L-Selectin", unit="pg/mL"),
    "pf4": dict(label="Platelet Factor 4", unit="ng/mL"),
    "sap": dict(label="SAP (Serum Amyloid P)", unit="pg/mL"),
}
49
-
50
- # Helpers to find columns by prefix (robust to unit suffixes)
51
- def _first_col_startswith(df: pd.DataFrame, prefixes) -> str | None:
52
- """
53
- Return the first column whose lowercase name starts with any prefix in `prefixes`.
54
- """
55
- if isinstance(prefixes, str):
56
- prefixes = [prefixes]
57
- prefixes = [p.lower() for p in prefixes]
58
- for col in df.columns:
59
- cl = col.lower()
60
- for p in prefixes:
61
- if cl.startswith(p):
62
- return col
63
- return None
64
-
65
-
66
def _value_min_max_cols(df: pd.DataFrame, analyte: str):
    """
    Locate the (value_col, min_col, max_col) columns for a base analyte name.

    The value column is looked up by the '<analyte>_value' prefix first and by
    '<analyte>_concentration' as a fallback. The bounds are looked up by
    '<analyte>_range_min' / '<analyte>_min' and '<analyte>_range_max' /
    '<analyte>_max'. Any element of the returned tuple is None when no column
    matches.
    """
    value_col = None
    for suffix in ("value", "concentration"):
        value_col = _first_col_startswith(df, f"{analyte}_{suffix}")
        if value_col is not None:
            break

    min_col, max_col = (
        _first_col_startswith(df, [f"{analyte}_range_{bound}", f"{analyte}_{bound}"])
        for bound in ("min", "max")
    )

    return value_col, min_col, max_col
79
-
80
- # Tidy Transformation
81
def tidy_from_wide(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform astronaut CSV with value/min/max triplets into tidy format.

    One record is produced per (analyte, input row) for every ANALYTE_INFO
    entry whose value column is found in `df` via flexible prefix matching.
    The 'anion_gap' entry is skipped, so the derived anion gap does not appear
    in the output.
    NOTE(review): because of that skip, an 'anion_gap_value' column in the
    input is ignored even though ANALYTE_INFO carries a manual range for it —
    confirm this is intended.

    A missing (NaN) per-row min/max falls back to the 'min'/'max' stored in
    ANALYTE_INFO, if any.

    Returns: columns [astronautID, timepoint, flight_day, analyte, value, min,
    max, label, unit, sex]. The columns are present even when no analyte
    matched, so downstream column access keeps working on an empty result.

    Raises:
        KeyError: if the astronautID or timepoint column (case-insensitive)
            is missing.
    """
    columns = [
        "astronautID", "timepoint", "flight_day", "analyte", "value",
        "min", "max", "label", "unit", "sex",
    ]
    # Sex is not stored in the CSV; these IDs are labelled male, all others female.
    male_ids = ("C001", "C004")

    # normalize lookup for id/timepoint columns
    colmap = {c.lower(): c for c in df.columns}
    astronaut_col = colmap.get("astronautid")
    timepoint_col = colmap.get("timepoint")

    if astronaut_col is None or timepoint_col is None:
        raise KeyError("Expected astronautID and timepoint columns in input CSV")

    tidy_records = []
    for analyte, meta in ANALYTE_INFO.items():
        if analyte == "anion_gap":
            continue

        value_col, min_col, max_col = _value_min_max_cols(df, analyte)
        if value_col is None:
            continue

        for _, row in df.iterrows():
            has_min = bool(min_col) and pd.notna(row[min_col])
            has_max = bool(max_col) and pd.notna(row[max_col])
            tidy_records.append({
                "astronautID": row[astronaut_col],
                "timepoint": row[timepoint_col],
                "flight_day": parse_timepoint(row[timepoint_col]),
                "analyte": analyte,
                "value": row[value_col],
                "min": row[min_col] if has_min else meta.get("min"),
                "max": row[max_col] if has_max else meta.get("max"),
                "label": meta["label"],
                "unit": meta["unit"],
                "sex": "Male" if str(row[astronaut_col]) in male_ids else "Female",
            })

    # Passing `columns` fixes the schema even when tidy_records is empty.
    return pd.DataFrame(tidy_records, columns=columns)
121
-
122
- # Statistical Comparison: R+1 vs L-series
123
def analyze_r1_vs_L(tidy: pd.DataFrame) -> pd.DataFrame:
    """
    Compare R+1 vs L-series for each analyte.

    - Within-astronaut: one-sample t-test (H0: mean(L) == R+1).
      One row per astronaut with n, mean, std, SE, t-stat, p-value and Cohen's d.
    - Across-astronauts (group-level): paired t-test on per-astronaut mean(L)
      vs R+1. One row with astronautID "ALL" holding the group mean, the std of
      the per-astronaut means, its SEM, t-stat, p-value and Cohen's d.

    An astronaut contributes only with at least two non-missing L values and
    exactly one non-missing R+1 value. The group row needs at least two
    contributing astronauts. Timepoint labels are matched after stripping
    whitespace and upper-casing, consistent with parse_timepoint.

    Parameters:
        tidy: long-format frame with 'analyte', 'astronautID', 'timepoint'
            and 'value' columns.

    Returns:
        DataFrame with columns [analyte, astronautID, test_type, n_L, mean_L,
        R1, std_L, se_L, t_stat, p_value, effect_size]. The columns are
        present even when there are no rows. Statistics are rounded (2
        decimals for means/std/SE, 3 for t and d, 4 for p).
    """
    columns = [
        "analyte", "astronautID", "test_type", "n_L", "mean_L", "R1",
        "std_L", "se_L", "t_stat", "p_value", "effect_size",
    ]
    r1_labels = ["R+1", "R1", "R+01"]

    results = []
    for analyte, subdf in tidy.groupby("analyte"):
        baseline_means, return_values = [], []

        ## Within-astronaut tests (also collects the inputs of the group test)
        for astronaut, adf in subdf.groupby("astronautID"):
            labels = adf["timepoint"].astype(str).str.strip().str.upper()
            L_vals = adf.loc[labels.str.startswith("L"), "value"].dropna().astype(float)
            R1_vals = adf.loc[labels.isin(r1_labels), "value"].dropna().astype(float)

            if len(L_vals) < 2 or len(R1_vals) != 1:
                continue

            R1 = float(R1_vals.iloc[0])
            mean_L = float(L_vals.mean())
            std_L = float(L_vals.std(ddof=1))
            n_L = int(L_vals.shape[0])

            baseline_means.append(mean_L)
            return_values.append(R1)

            if std_L > 0:
                se = std_L / np.sqrt(n_L)
                t_stat = (mean_L - R1) / se
                # Survival function avoids the precision loss of 1 - cdf
                p_val = 2 * stats.t.sf(abs(t_stat), df=n_L - 1)
                cohen_d = (R1 - mean_L) / std_L
            else:
                # Constant baseline: the test statistic is undefined
                se = t_stat = p_val = cohen_d = np.nan

            results.append({
                "analyte": analyte,
                "astronautID": astronaut,
                "test_type": "within",
                "n_L": n_L,
                "mean_L": round(mean_L, 2),
                "R1": round(R1, 2),
                "std_L": round(std_L, 2),
                "se_L": round(se, 2) if pd.notna(se) else np.nan,
                "t_stat": round(t_stat, 3) if pd.notna(t_stat) else np.nan,
                "p_value": round(p_val, 4) if pd.notna(p_val) else np.nan,
                "effect_size": round(cohen_d, 3) if pd.notna(cohen_d) else np.nan,
            })

        ## Across-astronauts (paired test)
        if len(baseline_means) < 2:
            continue

        diffs = np.array(return_values) - np.array(baseline_means)
        t_stat, p_val = stats.ttest_rel(return_values, baseline_means)

        # Group-level variability of the per-astronaut baseline means
        group_std = float(np.std(baseline_means, ddof=1))
        group_se = group_std / np.sqrt(len(baseline_means))

        diff_std = diffs.std(ddof=1)
        cohen_d = diffs.mean() / diff_std if diff_std > 0 else np.nan

        results.append({
            "analyte": analyte,
            "astronautID": "ALL",
            "test_type": "group",
            "n_L": len(baseline_means),
            "mean_L": round(float(np.mean(baseline_means)), 2),
            "R1": round(float(np.mean(return_values)), 2),
            "std_L": round(group_std, 2),
            "se_L": round(float(group_se), 2),
            "t_stat": round(float(t_stat), 3),
            "p_value": round(float(p_val), 4),
            "effect_size": round(float(cohen_d), 3) if pd.notna(cohen_d) else np.nan,
        })

    # Passing `columns` keeps the schema stable even when results is empty.
    return pd.DataFrame(results, columns=columns)