Spaces:
Sleeping
Sleeping
| """ | |
| Longitudinal progress tracking for combined corpora (21 children: | |
| Rollins 5 + Flusberg 6 + Quigley 10, each with multiple sessions). | |
| For each child we track the trajectory of core speech-language features | |
| across sessions and fit a simple linear trend. This mirrors the clinical | |
| question raised by the advisor: | |
| "Can AI tell whether the child's speech improves from session to session?" | |
| Outputs: | |
| reports/figures/longitudinal_trajectories.png (per-feature, per-child lines) | |
| reports/figures/longitudinal_composite_score.png (composite progress score) | |
| reports/metrics/longitudinal_trends.csv (slope, r, p per child-feature) | |
| reports/metrics/longitudinal_progress_summary.csv (first vs last session delta) | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import pandas as pd | |
| import seaborn as sns | |
| from scipy import stats | |
# All locations are resolved from the directory one level above this
# script's own folder.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
DATA_DIR = PROJECT_ROOT.joinpath("data")
FIG_DIR = PROJECT_ROOT.joinpath("reports", "figures")
METRIC_DIR = PROJECT_ROOT.joinpath("reports", "metrics")

# Output folders are created as an import-time side effect so the writer
# functions below can save files without checking for them.
FIG_DIR.mkdir(exist_ok=True, parents=True)
METRIC_DIR.mkdir(exist_ok=True, parents=True)

sns.set_theme(context="talk", style="whitegrid")

# Sign convention for every tracked feature:
#   +1 -> a larger value counts as improvement
#   -1 -> a smaller value counts as improvement
# Insertion order matters: it fixes the column / subplot order downstream.
FEATURE_DIRECTION = dict(
    mlu=1,
    mluw=1,
    ttr=1,
    total_words=1,
    total_utterances=1,
    unintelligible_ratio=-1,
    zero_vocalization_count=-1,
)
def compute_trends(df: pd.DataFrame) -> pd.DataFrame:
    """Fit a linear regression of feature ~ session_order for each child-feature.

    For every child (grouped on the ``child`` column) and every feature in
    ``FEATURE_DIRECTION``, an ordinary least-squares line is fitted with
    ``scipy.stats.linregress``. Sessions where either the session order or
    the feature value is NaN are excluded from that particular fit, so one
    missing measurement no longer turns the whole fit into NaN.

    A child-feature pair is skipped (no output row) when fewer than two
    valid points remain or when all remaining ``session_order`` values are
    identical, because no slope can be estimated in either case.

    Args:
        df: One row per session with columns ``child``, ``session_order``
            and every key of ``FEATURE_DIRECTION``.

    Returns:
        One row per fitted child-feature pair with columns ``child``,
        ``feature``, ``direction``, ``slope``, ``intercept``, ``r``,
        ``p_value`` (all rounded to 4 decimals), ``n_sessions`` (sessions the
        child has), ``n_valid`` (points actually used in the fit) and
        ``improving`` (slope sign agrees with the desired direction). The
        columns are present even when no pair could be fitted.
    """
    columns = [
        "child", "feature", "direction", "slope", "intercept",
        "r", "p_value", "n_sessions", "n_valid", "improving",
    ]
    rows = []
    for child, g in df.groupby("child"):
        g = g.sort_values("session_order")
        x_all = g["session_order"].to_numpy(dtype=float)
        for feat, direction in FEATURE_DIRECTION.items():
            y_all = g[feat].to_numpy(dtype=float)
            # Keep only sessions where both coordinates are present.
            valid = ~(np.isnan(x_all) | np.isnan(y_all))
            x, y = x_all[valid], y_all[valid]
            # A line needs two points and some spread along the x axis.
            if len(x) < 2 or np.ptp(x) == 0:
                continue
            res = stats.linregress(x, y)
            # Multiplying by the direction makes "improving" mean the same
            # thing for higher-is-better and lower-is-better features.
            improving = (res.slope * direction) > 0
            rows.append({
                "child": child,
                "feature": feat,
                "direction": "higher=better" if direction > 0 else "lower=better",
                "slope": round(res.slope, 4),
                "intercept": round(res.intercept, 4),
                "r": round(res.rvalue, 4),
                "p_value": round(res.pvalue, 4),
                "n_sessions": len(x_all),
                "n_valid": len(x),
                "improving": bool(improving),
            })
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)
def compute_progress_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise each child's change between the earliest and latest session.

    Sessions are ordered by ``session_order`` within each child; the first
    and last rows of that ordering supply the start and end values.

    Args:
        df: One row per session with columns ``child``, ``session_order``,
            ``age_months`` and every key of ``FEATURE_DIRECTION``.

    Returns:
        One row per child holding the session count, start/end age, the age
        span in months (NaN when either age is missing) and, per feature,
        ``<feature>_start``, ``<feature>_end`` and ``<feature>_delta``
        rounded to 3 decimals.
    """
    records = []
    for child_id, sessions in df.groupby("child"):
        ordered = sessions.sort_values("session_order")
        start_row = ordered.iloc[0]
        end_row = ordered.iloc[-1]

        age_start = start_row["age_months"]
        age_end = end_row["age_months"]
        # The span is only meaningful when both ages were recorded.
        if pd.notna(age_end) and pd.notna(age_start):
            span = round(age_end - age_start, 1)
        else:
            span = np.nan

        record = {
            "child": child_id,
            "n_sessions": len(ordered),
            "age_start_mo": age_start,
            "age_end_mo": age_end,
            "duration_mo": span,
        }
        for feat in FEATURE_DIRECTION:
            begin_val = start_row[feat]
            final_val = end_row[feat]
            record.update({
                f"{feat}_start": round(begin_val, 3),
                f"{feat}_end": round(final_val, 3),
                f"{feat}_delta": round(final_val - begin_val, 3),
            })
        records.append(record)
    return pd.DataFrame(records)
def plot_trajectories(df: pd.DataFrame) -> None:
    """Draw one subplot per tracked feature with a line per child and save it.

    The subplot grid is three columns wide and gets as many rows as needed
    to hold every key of ``FEATURE_DIRECTION``; a fixed 3x3 grid would
    silently drop any feature beyond the ninth. For up to nine features the
    layout and figure size match the fixed grid exactly.

    Args:
        df: One row per session with columns ``child``, ``corpus``,
            ``session_order`` and every key of ``FEATURE_DIRECTION``.
            The frame is copied, so the caller's data is not modified.

    Side effects:
        Writes ``longitudinal_trajectories.png`` into ``FIG_DIR`` and prints
        the saved path relative to ``PROJECT_ROOT``.
    """
    feats = list(FEATURE_DIRECTION.keys())
    n_cols = 3
    # Ceiling division; at least one row so the figure can always be built.
    n_rows = max(1, -(-len(feats) // n_cols))
    fig, axes = plt.subplots(
        n_rows, n_cols,
        figsize=(6 * n_cols, 14 * n_rows / 3),
        squeeze=False,  # always a 2-D array, so .flat works for any grid
    )
    # Create child+corpus label for clarity
    df = df.copy()
    df["child_label"] = df["child"] + " (" + df["corpus"].str[:4] + ")"
    for ax, feat in zip(axes.flat, feats):
        sns.lineplot(
            data=df, x="session_order", y=feat,
            hue="child_label", marker="o", ax=ax,
        )
        ax.set_title(feat)
        ax.set_xlabel("Session order")
        ax.set_ylabel("")
        ax.legend(fontsize=8, loc="best")
    # Hide unused subplots
    for ax in axes.flat[len(feats):]:
        ax.set_visible(False)
    fig.suptitle("Longitudinal: per-child trajectories across sessions\n(Rollins + Flusberg + Quigley)", y=1.00)
    fig.tight_layout()
    out = FIG_DIR / "longitudinal_trajectories.png"
    fig.savefig(out, dpi=150, bbox_inches="tight")
    plt.close(fig)
    print(f" saved {out.relative_to(PROJECT_ROOT)}")
def compute_composite_score(df: pd.DataFrame) -> pd.DataFrame:
    """Attach a direction-adjusted composite z-score to every session.

    Each feature in ``FEATURE_DIRECTION`` is standardised over all rows of
    ``df`` (population standard deviation, ``ddof=0``) and multiplied by its
    direction sign, so a positive value always means "better". A feature
    whose standard deviation is zero or NaN contributes 0.0 for every row.
    The composite is the row-wise mean of those signed z-scores.

    Args:
        df: One row per session containing every key of
            ``FEATURE_DIRECTION`` as a column.

    Returns:
        A copy of ``df`` with an extra ``composite_score`` column; the input
        frame is left untouched.
    """
    scored = df.copy()
    signed_z = pd.DataFrame(index=scored.index)
    for name, sign in FEATURE_DIRECTION.items():
        values = scored[name].astype(float)
        center = values.mean()
        spread = values.std(ddof=0)
        # No usable spread -> the feature carries no information.
        degenerate = spread == 0 or np.isnan(spread)
        signed_z[name] = 0.0 if degenerate else sign * (values - center) / spread
    scored["composite_score"] = signed_z.mean(axis=1)
    return scored
def plot_composite(df: pd.DataFrame) -> None:
    """Plot each child's composite score against session order and save it.

    Args:
        df: One row per session with columns ``child``, ``corpus``,
            ``session_order`` and ``composite_score``. The frame is copied
            before the label column is added.

    Side effects:
        Writes ``longitudinal_composite_score.png`` into ``FIG_DIR`` and
        prints the saved path relative to ``PROJECT_ROOT``.
    """
    # Legend entries read "<child> (<first four letters of corpus>)".
    labelled = df.copy()
    corpus_tag = labelled["corpus"].str[:4]
    labelled["child_label"] = labelled["child"] + " (" + corpus_tag + ")"

    fig, ax = plt.subplots(figsize=(12, 7))
    sns.lineplot(
        data=labelled,
        x="session_order",
        y="composite_score",
        hue="child_label",
        marker="o",
        linewidth=2.5,
        ax=ax,
    )
    # Dashed reference line at a composite score of zero.
    ax.axhline(0, color="gray", linestyle="--", alpha=0.5)
    ax.set_title("Longitudinal composite progress score\n(z-scored, direction-adjusted)\n(Rollins + Flusberg + Quigley)")
    ax.set_xlabel("Session order")
    ax.set_ylabel("Composite score (higher = better)")
    fig.tight_layout()

    target = FIG_DIR / "longitudinal_composite_score.png"
    fig.savefig(target, dpi=150, bbox_inches="tight")
    plt.close(fig)
    print(f" saved {target.relative_to(PROJECT_ROOT)}")
def main() -> None:
    """Run the full longitudinal analysis and print a readable report.

    Steps:
        1. Load ``longitudinal_features.csv`` from ``DATA_DIR`` and sort it
           by child and session order.
        2. Fit per-child, per-feature trends and save them to
           ``longitudinal_trends.csv`` in ``METRIC_DIR``.
        3. Save the first-vs-last summary to
           ``longitudinal_progress_summary.csv`` in ``METRIC_DIR``.
        4. Draw the trajectory and composite-score figures.
        5. Print per-child verdicts, the strongest improving pairs and the
           composite-score change for every child.

    The per-child verdict compares the number of improving features with
    the number of features that were actually fitted for that child, so a
    feature skipped by ``compute_trends`` does not count against the child.
    """
    csv_path = DATA_DIR / "longitudinal_features.csv"
    df = pd.read_csv(csv_path)
    df = df.sort_values(["child", "session_order"]).reset_index(drop=True)
    print(f"Loaded {len(df)} sessions from {df['corpus'].nunique()} corpora "
          f"for {df['child'].nunique()} children.\n")
    print("Corpus distribution:")
    print(df.groupby("corpus")["child"].nunique())
    print()

    # Per-child per-feature linear trends
    trends = compute_trends(df)
    # Add corpus info to trends (first corpus seen for each child)
    child_corpus = df.groupby("child")["corpus"].first().to_dict()
    if "child" in trends.columns:
        trends["corpus"] = trends["child"].map(child_corpus)
    out = METRIC_DIR / "longitudinal_trends.csv"
    trends.to_csv(out, index=False)
    print(f"[saved] {out.relative_to(PROJECT_ROOT)}")

    # First-vs-last session summary
    summary = compute_progress_summary(df)
    # Add corpus info to summary
    if "child" in summary.columns:
        summary["corpus"] = summary["child"].map(child_corpus)
    out = METRIC_DIR / "longitudinal_progress_summary.csv"
    summary.to_csv(out, index=False)
    print(f"[saved] {out.relative_to(PROJECT_ROOT)}")

    # Plots
    plot_trajectories(df)
    df_c = compute_composite_score(df)
    plot_composite(df_c)

    # Print human-readable verdict
    print("\n=== Per-child improvement summary ===")
    if trends.empty:
        # Nothing could be fitted (e.g. every child has a single session).
        print(" (no child-feature trend could be fitted)")
    else:
        # "sum" = improving features, "count" = features fitted for the child.
        per_child = (
            trends.groupby("child")["improving"]
            .agg(["sum", "count"])
            .sort_index()
        )
        for child, counts in per_child.iterrows():
            n_improved = int(counts["sum"])
            n_fitted = int(counts["count"])
            verdict = "IMPROVING" if n_improved > n_fitted / 2 else "mixed / stalling"
            print(f" {child:8s} {n_improved}/{n_fitted} features improving -> {verdict}")

        print("\n=== Top improving child-feature pairs (by |r|) ===")
        sig = trends[trends["improving"]].copy()
        sig = sig.reindex(sig["r"].abs().sort_values(ascending=False).index)
        print(sig.head(10).to_string(index=False))

    print("\n=== Composite score: first vs last session ===")
    for child, g in df_c.groupby("child"):
        g = g.sort_values("session_order")
        first = g["composite_score"].iloc[0]
        last = g["composite_score"].iloc[-1]
        if last > first:
            arrow = "↑"
        elif last < first:
            arrow = "↓"
        else:
            # Equal scores (e.g. a single session) or a NaN comparison:
            # do not report this as a decline.
            arrow = "→"
        print(f" {child:8s} {first:+.2f} -> {last:+.2f} "
              f"({arrow} delta = {last - first:+.2f})")
    print("\n[done] progress tracking complete.")


if __name__ == "__main__":
    main()