Spaces:

DTanzillo
/

Inspiration_Health_Data

No application file

App Files Files Community

DTanzillo committed on Sep 29, 2025

Commit

77a7dbf

verified ·

1 Parent(s): e55c60e

Delete scripts

Browse files

Files changed (3) hide show

scripts/featureEngineering.py +0 -63
scripts/graphMaking.py +0 -256
scripts/stats.py +0 -205

scripts/featureEngineering.py DELETED Viewed

@@ -1,63 +0,0 @@
-import numpy as np
-import pandas as pd
-def parse_timepoint(timepoint: str) -> int:
-    """
-    Convert timepoint strings like 'L-3', 'L0', 'R+0', 'R+1' into numeric flight days
-    on a stretched scale.
-    In particular, we are converting the 3 dats of flight into 30 days so there is a
-    difference, the final chart will have fake data in it.
-    Convention:
-        L-0 ->   0   (launch day = Flight Day 0)
-        L-3 ->  -3   (3 days before launch)
-        R+0 ->  30   (last day in space, stretched to day 30)
-        R+1 ->  31   (first recovery day)
-        R+N ->  N+30 (general rule for post-launch days)
-    """
-    label = str(timepoint).strip().upper()
-    if label.startswith("L"):  # Pre-launch
-        number = int(label.replace("L", "").replace("+", "").replace("-", "") or "0")
-        return -number
-    elif label.startswith("R"):  # Return / post-flight
-        number = int(label.replace("R", "").replace("+", "").replace("-", "") or "0")
-        return number + 30
-    return np.nan
-def add_flight_day(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Add a 'flight_day' column to a dataframe that already has 'timepoint' and 'astronautID'.
-    Drops 'Sample Name' if present, since it's redundant.
-    """
-    df = df.copy()
-    if "timepoint" not in df.columns:
-        raise ValueError("DataFrame must contain a 'timepoint' column")
-    # create numeric scale
-    df["flight_day"] = df["timepoint"].apply(parse_timepoint)
-    # drop redundant 'Sample Name' if it exists
-    if "Sample Name" in df.columns:
-        df = df.drop(columns=["Sample Name"])
-    return df
-def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Add derived feature: Anion Gap.
-    Anion Gap = Sodium − Chloride − Carbon Dioxide
-    """
-    df = df.copy()
-    if all(c in df.columns for c in ["sodium_value", "chloride_value", "carbon_dioxide_value"]):
-        df["anion_gap_value"] = (
-            df["sodium_value"].astype(float)
-            - df["chloride_value"].astype(float)
-            - df["carbon_dioxide_value"].astype(float)
-        )
-        # Placeholders; min/max defined manually in stats.ANALYTE_INFO
-        df["anion_gap_range_min"] = np.nan
-        df["anion_gap_range_max"] = np.nan
-    return df

scripts/graphMaking.py DELETED Viewed

@@ -1,256 +0,0 @@
-import plotly.graph_objects as go
-import plotly.express as px
-import numpy as np
-import pandas as pd
-def make_figure(
-    tidy_df: pd.DataFrame,
-    stats_df: pd.DataFrame,
-    analytes: list,
-    astronaut_filter=None,
-    show_error: str = None
-):
-    """
-    Build interactive mission-day plots with stats overlays.
-    """
-    fig = go.Figure()
-    # Highlight stretched space interval (0 to 30 days)
-    fig.add_vrect(x0=0, x1=30, fillcolor="LightGray", opacity=0.3,
-                  layer="below", line_width=0)
-    for day in [10, 20]:
-        fig.add_vline(x=day, line=dict(color="white", width=2, dash="dot"),
-                      layer="below")
-    df = tidy_df.copy()
-    # Apply participant filter
-    if astronaut_filter is None:
-        pass  # show all
-    elif isinstance(astronaut_filter, str) and astronaut_filter in ["Male", "Female"]:
-        if "sex" in df.columns:
-            df = df[df["sex"] == astronaut_filter]
-    elif isinstance(astronaut_filter, (list, tuple, set)):
-        df = df[df["astronautID"].isin(astronaut_filter)]
-    # Loop analytes requested
-    for analyte in analytes:
-        subdf = df[df["analyte"] == analyte]
-        if subdf.empty:
-            print(f"[make_figure] Skipping {analyte} – no data")
-            continue
-        ## Y-axis scaling
-        ref_min = subdf["min"].dropna().min()
-        ref_max = subdf["max"].dropna().max()
-        data_min = subdf["value"].min()
-        data_max = subdf["value"].max()
-        if "unit" in subdf.columns and not subdf["unit"].dropna().empty:
-            unit = subdf["unit"].dropna().iloc[0]
-            y_label = f"{analyte.title()} ({unit})"
-        else:
-            y_label = analyte.title()
-        ## Add healthy range lines from min / max
-        if pd.notna(ref_min):
-            fig.add_hline(
-                y=ref_min,
-                line=dict(color="green", width=2, dash="dot"),
-                annotation_text="Min",
-                annotation_position="bottom right"
-            )
-        if pd.notna(ref_max):
-            fig.add_hline(
-                y=ref_max,
-                line=dict(color="green", width=2, dash="dot"),
-                annotation_text="Max",
-                annotation_position="top right"
-            )
-        ## Decide axis limits: must include BOTH healthy range and all data
-        low_candidates = [v for v in [ref_min, data_min] if pd.notna(v)]
-        high_candidates = [v for v in [ref_max, data_max] if pd.notna(v)]
-        if low_candidates and high_candidates:
-            low = min(low_candidates)
-            high = max(high_candidates)
-            span = high - low if high > low else 1
-            padding = 0.1 * span
-            y_range = [low - padding, high + padding]
-        else:
-            y_range = None
-        ## Apply axis update once
-        if y_range:
-            fig.update_yaxes(title=y_label, range=y_range)
-        else:
-            fig.update_yaxes(title=y_label)
-        ## Plot each astronaut trace - first colors
-        palette = px.colors.qualitative.Set2
-        astronaut_colors = {astr: palette[i % len(palette)]
-                            for i, astr in enumerate(subdf["astronautID"].unique())}
-        ## Plot each astronaut trace
-        for astronaut, adf in subdf.groupby("astronautID"):
-            if adf.empty:
-                continue
-            adf = adf.sort_values("flight_day")
-            base_color = astronaut_colors[astronaut]
-            ### Skip if astronaut not in filter
-            if isinstance(astronaut_filter, (list, tuple, set)) and astronaut not in astronaut_filter:
-                continue
-            # Main Scatter Plot
-            fig.add_trace(go.Scatter(
-                x=adf["flight_day"],
-                y=adf["value"],
-                mode="lines+markers",
-                name=f"{astronaut} ({analyte})",
-                hovertext=adf["timepoint"],
-                hovertemplate="Day %{hovertext}<br>Value %{y}<extra></extra>",
-                line=dict(color=base_color),
-                marker=dict(color=base_color)
-            ))
-            ### Within-astronaut error band
-            if show_error == "within" and not stats_df.empty:
-                stat_rows = stats_df[
-                    (stats_df["analyte"] == analyte)
-                    & (stats_df["test_type"] == "within")
-                    ]
-                for _, row in stat_rows.iterrows():
-                    astronaut = row["astronautID"]
-                    if astronaut not in subdf["astronautID"].unique():
-                        continue  # skip astronauts not in this analyte subset
-                    mean_L = row.get("mean_L", np.nan)
-                    se = row.get("se_L", np.nan)
-                    R1 = row.get("R1", np.nan)
-                    if pd.isna(mean_L) or pd.isna(se):
-                        continue
-                    base_color = astronaut_colors.get(astronaut, "gray")
-                    if base_color.startswith("rgb"):
-                        fill_color = base_color.replace("rgb", "rgba").replace(")", ",0.15)")
-                    else:
-                        fill_color = base_color
-                    #### Horizontal band: L +/- SE
-                    fig.add_hrect(
-                        y0=mean_L - se, y1=mean_L + se,
-                        fillcolor=fill_color,
-                        opacity=0.2,
-                        line_width=0,
-                        layer="below"
-                    )
-                    #### Asterisk if R+1 outside band
-                    if pd.notna(R1) and (R1 < mean_L - se or R1 > mean_L + se):
-                        fig.add_annotation(
-                            x=31,
-                            y=R1,
-                            text="*",
-                            showarrow=False,
-                            font=dict(size=20, color="red"),
-                            yshift=15
-                        )
-        ## Group-level error band
-        if show_error == "group" and not stats_df.empty:
-            stat_rows = stats_df[
-                (stats_df["analyte"] == analyte)
-                & (stats_df["test_type"] == "group")
-                ]
-            for _, row in stat_rows.iterrows():
-                mean_L = row.get("mean_L", np.nan)
-                n = row.get("n_L", 0)
-                error = np.nan
-                if pd.notna(row.get("effect_size")) and n > 1 and row["effect_size"] != 0:
-                    error = abs(row.get("R1", np.nan) - mean_L) / abs(row["effect_size"])
-                if pd.isna(error):
-                    error = 0
-                #### Filter bands only if stats_df has group info
-                should_plot = True
-                if "group" in row.index and astronaut_filter is not None:
-                    group_id = row["group"]
-                    if isinstance(astronaut_filter, str) and astronaut_filter in ["Male", "Female"]:
-                        should_plot = (group_id == astronaut_filter)
-                    elif isinstance(astronaut_filter, (list, tuple, set)):
-                        # Only show if group_id matches one of the selected astronauts
-                        should_plot = (group_id in astronaut_filter)
-                if should_plot and pd.notna(mean_L):
-                    fig.add_hrect(
-                        y0=mean_L - error, y1=mean_L + error,
-                        fillcolor="gray", opacity=0.2,
-                        layer="below", line_width=0,
-                        annotation_text = "Group Error Band",
-                        annotation_position="top left"
-                    )
-                    if row.get("p_value") is not None and row["p_value"] < 0.05:
-                        fig.add_annotation(
-                            x=31,  # R+1 = 31
-                            y=row.get("R1", mean_L),
-                            text="*",
-                            showarrow=False,
-                            font=dict(size=20, color="red"),
-                            yshift=15
-                        )
-        ## Only update range if ref_min/ref_max are valid
-        if pd.notna(ref_min) and pd.notna(ref_max):
-            fig.update_yaxes(title=y_label,
-                             range=[ref_min * 0.9, ref_max * 1.1])
-        else:
-            fig.update_yaxes(title=y_label)
-    # Layout: Build Dynamic Title
-    if astronaut_filter is None:
-        group_label = "All Participants"
-    elif isinstance(astronaut_filter, str) and astronaut_filter in ["Male", "Female"]:
-        group_label = f"{astronaut_filter} Participants"
-    elif isinstance(astronaut_filter, (list, tuple, set)):
-        group_label = "Subset: " + ", ".join(astronaut_filter)
-    else:
-        group_label = "Participants"
-    # Build analyte label with units if available
-    ana_label = ", ".join(analytes)
-    unit_label = ""
-    subdf = df[df["analyte"] == analytes[0]]
-    if "unit" in subdf.columns and not subdf["unit"].dropna().empty:
-        unit_label = f" ({subdf['unit'].dropna().iloc[0]})"
-    fig.update_layout(
-        title=f"{ana_label.title()}{unit_label} Trends ({group_label})",
-        xaxis_title="Mission Day",
-        legend_title="Participant / Analyte",
-        hovermode="x unified",
-        template="plotly_white",
-        margin=dict(l=60, r=30, t=60, b=60)
-    )
-    # Custom ticks
-    ticks = [t for t in sorted(df["flight_day"].dropna().unique()) if pd.notna(t)]
-    ticktext = []
-    for t in ticks:
-        if t >= 30:
-            lbl = f"R+{int(t-30)}"
-        else:
-            lbl = f"L{int(t)}"
-        ticktext.append(lbl)
-    if ticks:
-        fig.update_xaxes(tickmode="array", tickvals=ticks, ticktext=ticktext)
-    return fig

scripts/stats.py DELETED Viewed

@@ -1,205 +0,0 @@
-import pandas as pd
-import numpy as np
-from scipy import stats
-from .featureEngineering import parse_timepoint
-# Map analyte base names to human-readable labels and units. Only entries with a
-# manually defined reference range (currently just "anion_gap") also carry "min"/"max".
-## To get sub and superscripts in Markdown I used ChatGPT: https://chatgpt.com/share/68d9c8f6-2674-8008-8ff7-0731bec9ad49
-ANALYTE_INFO = {
-    # Blood chemistry
-    "albumin": {"label": "Albumin", "unit": "g/dL"},
-    "alkaline_phosphatase": {"label": "Alkaline Phosphatase", "unit": "U/L"},
-    "alt": {"label": "ALT", "unit": "U/L"},
-    "ast": {"label": "AST", "unit": "U/L"},
-    "total_bilirubin": {"label": "Bilirubin", "unit": "mg/dL"},
-    "bun_to_creatinine_ratio": {"label": "BUN/Creatinine Ratio", "unit": ""},
-    "calcium": {"label": "Ca²⁺", "unit": "mg/dL"},
-    "carbon_dioxide": {"label": "CO₂", "unit": "mmol/L"},
-    "chloride": {"label": "Cl⁻", "unit": "mmol/L"},
-    "creatinine": {"label": "Creatinine", "unit": "mg/dL"},
-    "egfr_african_american": {"label": "eGFR (AA)", "unit": "mL/min/1.73m²"},
-    "egfr_non_african_american": {"label": "eGFR (non-AA)", "unit": "mL/min/1.73m²"},
-    "globulin": {"label": "Globulin", "unit": "g/dL"},
-    "glucose": {"label": "Glucose", "unit": "mg/dL"},
-    "potassium": {"label": "K⁺", "unit": "mmol/L"},
-    "total_protein": {"label": "Protein", "unit": "g/dL"},
-    "sodium": {"label": "Na⁺", "unit": "mmol/L"},
-    "urea_nitrogen_bun": {"label": "BUN", "unit": "mg/dL"},
-    # Derived feature (not a measured column; carries its own reference range)
-    "anion_gap": {
-        "label": "Anion Gap",
-        "unit": "mmol/L",
-        "min": 8,   # manual reference range
-        "max": 24
-    },
-    # Cardiovascular
-    "a2_macroglobulin": {"label": "α₂-Macroglobulin", "unit": "ng/mL"},
-    "agp": {"label": "AGP (α1-acid glycoprotein)", "unit": "ng/mL"},
-    "crp": {"label": "CRP (C-reactive protein)", "unit": "pg/mL"},
-    "fetuin_a36": {"label": "Fetuin A3/6", "unit": "ng/mL"},
-    "fibrinogen": {"label": "Fibrinogen", "unit": "ng/mL"},
-    "haptoglobin": {"label": "Haptoglobin", "unit": "ng/mL"},
-    "l_selectin": {"label": "L-Selectin", "unit": "pg/mL"},
-    "pf4": {"label": "Platelet Factor 4", "unit": "ng/mL"},
-    "sap": {"label": "SAP (Serum Amyloid P)", "unit": "pg/mL"},
-}
-# Helpers to find columns by prefix (robust to unit suffixes)
-def _first_col_startswith(df: pd.DataFrame, prefixes) -> str | None:
-    """
-    Return the first column whose lowercase name starts with any prefix in `prefixes`.
-    """
-    if isinstance(prefixes, str):
-        prefixes = [prefixes]
-    prefixes = [p.lower() for p in prefixes]
-    for col in df.columns:
-        cl = col.lower()
-        for p in prefixes:
-            if cl.startswith(p):
-                return col
-    return None
-def _value_min_max_cols(df: pd.DataFrame, analyte: str):
-    """
-    For a given base analyte name, return (value_col, min_col, max_col).
-    Works with clinical chemistry (…_value) and cardiovascular (…_concentration / …_percent).
-    """
-    v = _first_col_startswith(df, f"{analyte}_value")
-    if v is None:
-        v = _first_col_startswith(df, f"{analyte}_concentration")
-    mn = _first_col_startswith(df, [f"{analyte}_range_min", f"{analyte}_min"])
-    mx = _first_col_startswith(df, [f"{analyte}_range_max", f"{analyte}_max"])
-    return v, mn, mx
-# Tidy Transformation
-def tidy_from_wide(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Transform astronaut CSV with value/min/max triplets into tidy format.
-    Adds derived analytes (like Anion Gap) using flexible column matching.
-    Returns: columns [astronautID, timepoint, flight_day, analyte, value, min, max, unit, label, sex]
-    """
-    tidy_records = []
-    # normalize lookup for id/timepoint columns
-    colmap = {c.lower(): c for c in df.columns}
-    astronaut_col = colmap.get("astronautid")
-    timepoint_col = colmap.get("timepoint")
-    if astronaut_col is None or timepoint_col is None:
-        raise KeyError("Expected astronautID and timepoint columns in input CSV")
-    for analyte, meta in ANALYTE_INFO.items():
-        if analyte == "anion_gap":
-            continue
-        value_col, min_col, max_col = _value_min_max_cols(df, analyte)
-        if value_col is None:
-            continue
-        for _, row in df.iterrows():
-            rec = {
-                "astronautID": row[astronaut_col],
-                "timepoint": row[timepoint_col],
-                "flight_day": parse_timepoint(row[timepoint_col]),
-                "analyte": analyte,
-                "value": row[value_col],
-                "min": (row[min_col] if (min_col and pd.notna(row[min_col])) else meta.get("min")),
-                "max": (row[max_col] if (max_col and pd.notna(row[max_col])) else meta.get("max")),
-                "label": meta["label"],
-                "unit": meta["unit"],
-                "sex": "Male" if str(row[astronaut_col]) in ["C001", "C004"] else "Female",
-            }
-            tidy_records.append(rec)
-    return pd.DataFrame(tidy_records)
-# Statistical Comparison: R+1 vs L-series
-def analyze_r1_vs_L(tidy: pd.DataFrame) -> pd.DataFrame:
-    """
-    Compare R+1 vs L-series for each analyte.
-    - Within-astronaut: one-sample t-test (H0: mean(L) == R+1)
-      Returns per-astronaut mean, std, SE, t-stat, p-value, and Cohen's d.
-    - Across-astronauts (group-level): paired t-test on per-astronaut mean(L) vs R+1
-      Returns group mean, std across astronauts, SEM, t-stat, p-value, and Cohen's d.
-    """
-    results = []
-    for analyte, subdf in tidy.groupby("analyte"):
-        ## Within-astronaut tests
-        for astronaut, adf in subdf.groupby("astronautID"):
-            L_mask = adf["timepoint"].astype(str).str.startswith("L")
-            R1_mask = adf["timepoint"].astype(str).isin(["R+1", "R1", "R+01"])
-            L_vals = adf.loc[L_mask, "value"].dropna().astype(float)
-            R1_vals = adf.loc[R1_mask, "value"].dropna().astype(float)
-            if len(L_vals) >= 2 and len(R1_vals) == 1:
-                R1 = float(R1_vals.iloc[0])
-                mean_L = float(L_vals.mean())
-                std_L = float(L_vals.std(ddof=1))
-                n_L = int(L_vals.shape[0])
-                if std_L > 0:
-                    se = std_L / np.sqrt(n_L)
-                    t_stat = (mean_L - R1) / se
-                    p_val = 2 * (1 - stats.t.cdf(abs(t_stat), df=n_L - 1))
-                    cohen_d = (R1 - mean_L) / std_L
-                else:
-                    se = t_stat = p_val = cohen_d = np.nan
-                results.append({
-                    "analyte": analyte,
-                    "astronautID": astronaut,
-                    "test_type": "within",
-                    "n_L": n_L,
-                    "mean_L": round(mean_L, 2),
-                    "R1": round(R1, 2),
-                    "std_L": round(std_L, 2),
-                    "se_L": round(se, 2) if pd.notna(se) else np.nan,
-                    "t_stat": round(t_stat, 3) if pd.notna(t_stat) else np.nan,
-                    "p_value": round(p_val, 4) if pd.notna(p_val) else np.nan,
-                    "effect_size": round(cohen_d, 3) if pd.notna(cohen_d) else np.nan,
-                })
-        ## Across-astronauts (paired test)
-        astronaut_means, astronaut_R1 = [], []
-        for astronaut, adf in subdf.groupby("astronautID"):
-            L_mask = adf["timepoint"].astype(str).str.startswith("L")
-            R1_mask = adf["timepoint"].astype(str).isin(["R+1", "R1", "R+01"])
-            L_vals = adf.loc[L_mask, "value"].dropna().astype(float)
-            R1_vals = adf.loc[R1_mask, "value"].dropna().astype(float)
-            if len(L_vals) >= 2 and len(R1_vals) == 1:
-                astronaut_means.append(float(L_vals.mean()))
-                astronaut_R1.append(float(R1_vals.iloc[0]))
-        if len(astronaut_means) >= 2:
-            diffs = np.array(astronaut_R1) - np.array(astronaut_means)
-            t_stat, p_val = stats.ttest_rel(astronaut_R1, astronaut_means)
-            # Group-level variability
-            std_L = np.std(astronaut_means, ddof=1)
-            se_L = std_L / np.sqrt(len(astronaut_means))
-            cohen_d = diffs.mean() / diffs.std(ddof=1) if diffs.std(ddof=1) > 0 else np.nan
-            results.append({
-                "analyte": analyte,
-                "astronautID": "ALL",
-                "test_type": "group",
-                "n_L": len(astronaut_means),
-                "mean_L": round(float(np.mean(astronaut_means)), 2),
-                "R1": round(float(np.mean(astronaut_R1)), 2),
-                "t_stat": round(float(t_stat), 3),
-                "p_value": round(float(p_val), 4),
-                "effect_size": round(float(cohen_d), 3) if pd.notna(cohen_d) else np.nan,
-            })
-    return pd.DataFrame(results)